Skip to content

Commit 62b2cdb

Browse files
committed
Implement grapheme clusters
1 parent 0a83946 commit 62b2cdb

30 files changed

+2964
-335
lines changed

.github/actions/spelling/expect/expect.txt

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@ bytebuffer
144144
cac
145145
cacafire
146146
CALLCONV
147+
CANDRABINDU
147148
capslock
148149
CARETBLINKINGENABLED
149150
CARRIAGERETURN
@@ -155,6 +156,7 @@ cbiex
155156
CBN
156157
CBoolean
157158
cbt
159+
Ccc
158160
CCCBB
159161
cch
160162
CCHAR
@@ -180,6 +182,7 @@ chaof
180182
charinfo
181183
CHARSETINFO
182184
chh
185+
chonker
183186
chshdng
184187
CHT
185188
Cic
@@ -598,7 +601,9 @@ FEEF
598601
fesb
599602
FFAF
600603
FFDE
604+
FFFD
601605
FFFDb
606+
FFrom
602607
fgbg
603608
FGCOLOR
604609
FGHIJ
@@ -617,6 +622,7 @@ FINDDOWN
617622
FINDSTRINGEXACT
618623
FINDUP
619624
FIter
625+
FITZPATRICK
620626
FIXEDCONVERTED
621627
FIXEDFILEINFO
622628
Flg
@@ -888,11 +894,13 @@ jconcpp
888894
JLO
889895
JOBOBJECT
890896
JOBOBJECTINFOCLASS
897+
JONGSEONG
891898
JPN
892899
jsoncpp
893900
Jsons
894901
jsprovider
895902
jumplist
903+
JUNGSEONG
896904
KAttrs
897905
kawa
898906
Kazu
@@ -911,6 +919,7 @@ keyups
911919
KILLACTIVE
912920
KILLFOCUS
913921
kinda
922+
KIYEOK
914923
KLF
915924
KLMNO
916925
KLMNOPQRST
@@ -1020,6 +1029,7 @@ luma
10201029
lval
10211030
LVB
10221031
LVERTICAL
1032+
LVT
10231033
LWA
10241034
LWIN
10251035
lwkmvj
@@ -1049,6 +1059,7 @@ mdmerge
10491059
MDs
10501060
MEASUREITEM
10511061
megamix
1062+
Meh
10521063
memallocator
10531064
meme
10541065
MENUCHAR
@@ -1164,6 +1175,7 @@ NOMINMAX
11641175
NOMOVE
11651176
NONALERT
11661177
nonbreaking
1178+
noncharacter
11671179
nonclient
11681180
NONINFRINGEMENT
11691181
NONPREROTATED
@@ -1212,6 +1224,7 @@ ntuser
12121224
NTVDM
12131225
ntverp
12141226
nugetversions
1227+
NUKTA
12151228
nullness
12161229
nullonfailure
12171230
nullopts
@@ -1489,6 +1502,7 @@ renderengine
14891502
rendersize
14901503
reparented
14911504
reparenting
1505+
REPH
14921506
replatformed
14931507
Replymessage
14941508
repositorypath
@@ -1517,6 +1531,7 @@ rgw
15171531
RIGHTALIGN
15181532
RIGHTBUTTON
15191533
riid
1534+
ris
15201535
RIS
15211536
roadmap
15221537
robomac
@@ -1883,6 +1898,7 @@ UPDATEDISPLAY
18831898
UPDOWN
18841899
UPKEY
18851900
upss
1901+
UPSS
18861902
uregex
18871903
URegular
18881904
usebackq
@@ -1925,6 +1941,7 @@ vga
19251941
vgaoem
19261942
viewkind
19271943
viewports
1944+
VIRAMA
19281945
Virt
19291946
VIRTTERM
19301947
vkey
@@ -2165,6 +2182,7 @@ Zabcdefghijklmn
21652182
Zabcdefghijklmnopqrstuvwxyz
21662183
ZCmd
21672184
ZCtrl
2185+
ZWJs
21682186
zxcvbnm
21692187
ZYXWVU
21702188
ZYXWVUTd

doc/cascadia/profiles.schema.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2344,6 +2344,11 @@
23442344
"description": "Force the terminal to use the legacy input encoding. Certain keys in some applications may stop working when enabling this setting.",
23452345
"type": "boolean"
23462346
},
2347+
"experimental.graphemes": {
2348+
"default": true,
2349+
"description": "When set to true, the terminal will use grapheme cluster boundaries for cursor movement. Otherwise, the terminal will use codepoint boundaries.",
2350+
"type": "boolean"
2351+
},
23472352
"experimental.useBackgroundImageForWindow": {
23482353
"default": false,
23492354
"description": "When set to true, the background image for the currently focused profile is expanded to encompass the entire window, beneath other panes.",

src/buffer/out/Row.cpp

Lines changed: 23 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,8 @@
55
#include "Row.hpp"
66

77
#include <isa_availability.h>
8-
#include <til/unicode.h>
98

10-
#include "textBuffer.hpp"
11-
#include "../../types/inc/GlyphWidth.hpp"
9+
#include "../../types/inc/CodepointWidthDetector.hpp"
1210

1311
// It would be nice to add checked array access in the future, but it's a little annoying to do so without impacting
1412
// performance (including Debug performance). Other languages are a little bit more ergonomic there than C++.
@@ -646,60 +644,45 @@ catch (...)
646644
//
647645
// We can infer the "end" from the amount of columns we're given (colLimit - colBeg),
648646
// because ASCII is always 1 column wide per character.
649-
auto it = chars.begin();
650-
const auto end = it + std::min<size_t>(chars.size(), colLimit - colBeg);
647+
const auto len = std::min<size_t>(chars.size(), colLimit - colBeg);
651648
size_t ch = chBeg;
652649

653-
while (it != end)
650+
for (size_t off = 0; off < len; ++off)
654651
{
655-
if (*it >= 0x80) [[unlikely]]
652+
if (chars[off] >= 0x80) [[unlikely]]
656653
{
657-
_replaceTextUnicode(ch, it);
654+
_replaceTextUnicode(ch, off);
658655
return;
659656
}
660657

661658
til::at(row._charOffsets, colEnd) = gsl::narrow_cast<uint16_t>(ch);
662659
++colEnd;
663660
++ch;
664-
++it;
665661
}
666662

667663
colEndDirty = colEnd;
668664
charsConsumed = ch - chBeg;
669665
}
670666

671-
[[msvc::forceinline]] void ROW::WriteHelper::_replaceTextUnicode(size_t ch, std::wstring_view::const_iterator it) noexcept
667+
[[msvc::forceinline]] void ROW::WriteHelper::_replaceTextUnicode(size_t ch, size_t off)
672668
{
673-
const auto end = chars.end();
669+
auto& cwd = CodepointWidthDetector::Singleton();
670+
const auto len = chars.size();
674671

675-
while (it != end)
672+
// The non-ASCII character we have encountered may be a combining mark, like "a^" which is then displayed as "â".
673+
// In order to recognize both characters as a single grapheme, we need to back up by 1 ASCII character
674+
// and let GraphemeNext() find the next proper grapheme boundary.
675+
if (off != 0)
676676
{
677-
unsigned int width = 1;
678-
auto ptr = &*it;
679-
const auto wch = *ptr;
680-
size_t advance = 1;
681-
682-
++it;
683-
684-
// Even in our slow-path we can avoid calling IsGlyphFullWidth if the current character is ASCII.
685-
// It also allows us to skip the surrogate pair decoding at the same time.
686-
if (wch >= 0x80)
687-
{
688-
if (til::is_surrogate(wch))
689-
{
690-
if (it != end && til::is_leading_surrogate(wch) && til::is_trailing_surrogate(*it))
691-
{
692-
advance = 2;
693-
++it;
694-
}
695-
else
696-
{
697-
ptr = &UNICODE_REPLACEMENT;
698-
}
699-
}
677+
--colEnd;
678+
--ch;
679+
--off;
680+
}
700681

701-
width = IsGlyphFullWidth({ ptr, advance }) + 1u;
702-
}
682+
while (off < len)
683+
{
684+
int width;
685+
const auto end = cwd.GraphemeNext(chars, off, &width);
703686

704687
const auto colEndNew = gsl::narrow_cast<uint16_t>(colEnd + width);
705688
if (colEndNew > colLimit)
@@ -719,7 +702,8 @@ catch (...)
719702
til::at(row._charOffsets, colEnd++) = gsl::narrow_cast<uint16_t>(ch | CharOffsetsTrailer);
720703
}
721704

722-
ch += advance;
705+
ch += end - off;
706+
off = end;
723707
}
724708

725709
colEndDirty = colEnd;
@@ -1062,7 +1046,7 @@ std::wstring_view ROW::GetText() const noexcept
10621046

10631047
std::wstring_view ROW::GetText(til::CoordType columnBegin, til::CoordType columnEnd) const noexcept
10641048
{
1065-
const til::CoordType columns = _columnCount;
1049+
const auto columns = GetReadableColumnCount();
10661050
const auto colBeg = clamp(columnBegin, 0, columns);
10671051
const auto colEnd = clamp(columnEnd, colBeg, columns);
10681052
const size_t chBeg = _uncheckedCharOffset(gsl::narrow_cast<size_t>(colBeg));

src/buffer/out/Row.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ class ROW final
181181
bool IsValid() const noexcept;
182182
void ReplaceCharacters(til::CoordType width) noexcept;
183183
void ReplaceText() noexcept;
184-
void _replaceTextUnicode(size_t ch, std::wstring_view::const_iterator it) noexcept;
184+
void _replaceTextUnicode(size_t ch, size_t off);
185185
void CopyTextFrom(const std::span<const uint16_t>& charOffsets) noexcept;
186186
static void _copyOffsets(uint16_t* dst, const uint16_t* src, uint16_t size, uint16_t offset) noexcept;
187187
void Finish();

src/buffer/out/textBuffer.cpp

Lines changed: 26 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,13 @@
22
// Licensed under the MIT license.
33

44
#include "precomp.h"
5-
65
#include "textBuffer.hpp"
76

87
#include <til/hash.h>
9-
#include <til/unicode.h>
108

119
#include "UTextAdapter.h"
12-
#include "../../types/inc/GlyphWidth.hpp"
10+
#include "../../types/inc/CodepointWidthDetector.hpp"
1311
#include "../renderer/base/renderer.hpp"
14-
#include "../types/inc/convert.hpp"
1512
#include "../types/inc/utils.hpp"
1613

1714
using namespace Microsoft::Console;
@@ -408,17 +405,17 @@ void TextBuffer::_PrepareForDoubleByteSequence(const DbcsAttribute dbcsAttribute
408405
// Given the character offset `position` in the `chars` string, this function returns the starting position of the next grapheme.
409406
// For instance, given a `chars` of L"x\uD83D\uDE42y" and a `position` of 1 it'll return 3.
410407
// GraphemePrev would do the exact inverse of this operation.
411-
// In the future, these functions are expected to also deliver information about how many columns a grapheme occupies.
412-
// (I know that mere UTF-16 code point iteration doesn't handle graphemes, but that's what we're working towards.)
413408
size_t TextBuffer::GraphemeNext(const std::wstring_view& chars, size_t position) noexcept
414409
{
415-
return til::utf16_iterate_next(chars, position);
410+
auto& cwd = CodepointWidthDetector::Singleton();
411+
return cwd.GraphemeNext(chars, position, nullptr);
416412
}
417413

418414
// It's the counterpart to GraphemeNext. See GraphemeNext.
419415
size_t TextBuffer::GraphemePrev(const std::wstring_view& chars, size_t position) noexcept
420416
{
421-
return til::utf16_iterate_prev(chars, position);
417+
auto& cwd = CodepointWidthDetector::Singleton();
418+
return cwd.GraphemePrev(chars, position, nullptr);
422419
}
423420

424421
// Ever wondered how much space a piece of text needs before inserting it? This function will tell you!
@@ -445,7 +442,7 @@ size_t TextBuffer::FitTextIntoColumns(const std::wstring_view& chars, til::Coord
445442
{
446443
}
447444

448-
const auto dist = gsl::narrow_cast<size_t>(it - beg);
445+
auto dist = gsl::narrow_cast<size_t>(it - beg);
449446
auto col = gsl::narrow_cast<til::CoordType>(dist);
450447

451448
if (it == asciiEnd) [[likely]]
@@ -455,33 +452,23 @@ size_t TextBuffer::FitTextIntoColumns(const std::wstring_view& chars, til::Coord
455452
}
456453

457454
// Unicode slow-path where we need to count text and columns separately.
458-
for (;;)
459-
{
460-
auto ptr = &*it;
461-
const auto wch = *ptr;
462-
size_t len = 1;
463-
464-
col++;
455+
auto& cwd = CodepointWidthDetector::Singleton();
456+
const auto len = chars.size();
465457

466-
// Even in our slow-path we can avoid calling IsGlyphFullWidth if the current character is ASCII.
467-
// It also allows us to skip the surrogate pair decoding at the same time.
468-
if (wch >= 0x80)
469-
{
470-
if (til::is_surrogate(wch))
471-
{
472-
const auto it2 = it + 1;
473-
if (til::is_leading_surrogate(wch) && it2 != end && til::is_trailing_surrogate(*it2))
474-
{
475-
len = 2;
476-
}
477-
else
478-
{
479-
ptr = &UNICODE_REPLACEMENT;
480-
}
481-
}
458+
// The non-ASCII character we have encountered may be a combining mark, like "a^" which is then displayed as "â".
459+
// In order to recognize both characters as a single grapheme, we need to back up by 1 ASCII character
460+
// and let GraphemeNext() find the next proper grapheme boundary.
461+
if (dist != 0)
462+
{
463+
dist--;
464+
col--;
465+
}
482466

483-
col += IsGlyphFullWidth({ ptr, len });
484-
}
467+
while (dist < len)
468+
{
469+
int width;
470+
dist = cwd.GraphemeNext(chars, dist, &width);
471+
col += width;
485472

486473
// If we ran out of columns, we need to always return `columnLimit` and not `cols`,
487474
// because if we tried inserting a wide glyph into just 1 remaining column it will
@@ -490,17 +477,13 @@ size_t TextBuffer::FitTextIntoColumns(const std::wstring_view& chars, til::Coord
490477
if (col > columnLimit)
491478
{
492479
columns = columnLimit;
493-
return gsl::narrow_cast<size_t>(it - beg);
494-
}
495-
496-
// But if we simply ran out of text we just need to return the actual number of columns.
497-
it += len;
498-
if (it == end)
499-
{
500-
columns = col;
501-
return chars.size();
480+
return dist;
502481
}
503482
}
483+
484+
// But if we simply ran out of text we just need to return the actual number of columns.
485+
columns = col;
486+
return chars.size();
504487
}
505488

506489
// Pretend as if `position` is a regular cursor in the TextBuffer.

src/cascadia/TerminalCore/ICoreSettings.idl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ namespace Microsoft.Terminal.Core
2020
String WordDelimiters;
2121

2222
Boolean ForceVTInput;
23+
Boolean Graphemes;
2324
Boolean TrimBlockSelection;
2425
Boolean DetectURLs;
2526
Boolean VtPassthrough;

0 commit comments

Comments
 (0)