Skip to content

Commit 7802e20

Browse files
committed
PRE-MERGE microsoft#16916 Implement grapheme clusters
2 parents e321c00 + bb47e9e commit 7802e20

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

55 files changed

+3630
-738
lines changed

.github/actions/spelling/expect/expect.txt

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@ bytebuffer
146146
cac
147147
cacafire
148148
CALLCONV
149+
CANDRABINDU
149150
capslock
150151
CARETBLINKINGENABLED
151152
CARRIAGERETURN
@@ -156,6 +157,7 @@ CBash
156157
cbiex
157158
CBN
158159
cbt
160+
Ccc
159161
CCCBB
160162
cch
161163
CCHAR
@@ -293,7 +295,6 @@ CREATESTRUCT
293295
CREATESTRUCTW
294296
createvpack
295297
crisman
296-
CRLFs
297298
crloew
298299
CRTLIBS
299300
csbi
@@ -593,6 +594,7 @@ fesb
593594
FFAF
594595
ffd
595596
FFDE
597+
FFFD
596598
FFFDb
597599
fgbg
598600
FGCOLOR
@@ -613,6 +615,7 @@ FINDREGEX
613615
FINDSTRINGEXACT
614616
FINDUP
615617
FIter
618+
FITZPATRICK
616619
FIXEDFILEINFO
617620
Flg
618621
flyouts
@@ -879,10 +882,12 @@ jconcpp
879882
JLO
880883
JOBOBJECT
881884
JOBOBJECTINFOCLASS
885+
JONGSEONG
882886
JPN
883887
jsoncpp
884888
jsprovider
885889
jumplist
890+
JUNGSEONG
886891
KAttrs
887892
kawa
888893
Kazu
@@ -901,6 +906,7 @@ keyups
901906
KILLACTIVE
902907
KILLFOCUS
903908
kinda
909+
KIYEOK
904910
KLF
905911
KLMNO
906912
KLMNOPQRST
@@ -1010,6 +1016,7 @@ luma
10101016
lval
10111017
LVB
10121018
LVERTICAL
1019+
LVT
10131020
LWA
10141021
LWIN
10151022
lwkmvj
@@ -1205,6 +1212,7 @@ ntuser
12051212
NTVDM
12061213
ntverp
12071214
nugetversions
1215+
NUKTA
12081216
nullness
12091217
nullonfailure
12101218
nullopts
@@ -1467,7 +1475,6 @@ READMODE
14671475
rectread
14681476
redef
14691477
redefinable
1470-
Redir
14711478
redist
14721479
REDSCROLL
14731480
REFCLSID
@@ -1485,6 +1492,7 @@ renderengine
14851492
rendersize
14861493
reparented
14871494
reparenting
1495+
REPH
14881496
replatformed
14891497
Replymessage
14901498
repositorypath
@@ -1514,6 +1522,7 @@ rgw
15141522
RIGHTALIGN
15151523
RIGHTBUTTON
15161524
riid
1525+
ris
15171526
RIS
15181527
roadmap
15191528
robomac
@@ -1919,6 +1928,7 @@ vga
19191928
vgaoem
19201929
viewkind
19211930
viewports
1931+
VIRAMA
19221932
Virt
19231933
VIRTTERM
19241934
vkey
@@ -1969,8 +1979,8 @@ wchars
19691979
WCIA
19701980
WCIW
19711981
WCSHELPER
1972-
wcsicmp
19731982
wcsrev
1983+
wcswidth
19741984
wddm
19751985
wddmcon
19761986
WDDMCONSOLECONTEXT
@@ -2125,6 +2135,7 @@ XFORM
21252135
XIn
21262136
XManifest
21272137
XMath
2138+
XNamespace
21282139
xorg
21292140
XPan
21302141
XResource
@@ -2156,6 +2167,7 @@ Zabcdefghijklmn
21562167
Zabcdefghijklmnopqrstuvwxyz
21572168
ZCmd
21582169
ZCtrl
2170+
ZWJs
21592171
zxcvbnm
21602172
ZYXWVU
21612173
ZYXWVUTd

src/buffer/out/Row.cpp

Lines changed: 73 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,8 @@
55
#include "Row.hpp"
66

77
#include <isa_availability.h>
8-
#include <til/unicode.h>
98

10-
#include "textBuffer.hpp"
11-
#include "../../types/inc/GlyphWidth.hpp"
9+
#include "../../types/inc/CodepointWidthDetector.hpp"
1210

1311
// It would be nice to add checked array access in the future, but it's a little annoying to do so without impacting
1412
// performance (including Debug performance). Other languages are a little bit more ergonomic there than C++.
@@ -568,6 +566,7 @@ void ROW::ReplaceAttributes(const til::CoordType beginIndex, const til::CoordTyp
568566
void ROW::ReplaceCharacters(til::CoordType columnBegin, til::CoordType width, const std::wstring_view& chars)
569567
try
570568
{
569+
assert(width >= 1 && width <= 2);
571570
WriteHelper h{ *this, columnBegin, _columnCount, chars };
572571
if (!h.IsValid())
573572
{
@@ -666,56 +665,89 @@ catch (...)
666665

667666
[[msvc::forceinline]] void ROW::WriteHelper::_replaceTextUnicode(size_t ch, std::wstring_view::const_iterator it) noexcept
668667
{
669-
const auto end = chars.end();
668+
auto& cwd = CodepointWidthDetector::Singleton();
670669

671-
while (it != end)
670+
// Check if the new text joins with the existing contents of the row to form a single grapheme cluster.
671+
if (it == chars.begin())
672672
{
673-
unsigned int width = 1;
674-
auto ptr = &*it;
675-
const auto wch = *ptr;
676-
size_t advance = 1;
673+
auto colPrev = colBeg;
674+
while (colPrev > 0 && row._uncheckedIsTrailer(--colPrev))
675+
{
676+
}
677677

678-
++it;
678+
const auto chPrev = row._uncheckedCharOffset(colPrev);
679+
const std::wstring_view charsPrev{ row._chars.data() + chPrev, ch - chPrev };
679680

680-
// Even in our slow-path we can avoid calling IsGlyphFullWidth if the current character is ASCII.
681-
// It also allows us to skip the surrogate pair decoding at the same time.
682-
if (wch >= 0x80)
681+
GraphemeState state;
682+
cwd.GraphemeNext(state, charsPrev);
683+
cwd.GraphemeNext(state, chars);
684+
685+
if (state.len > 0)
683686
{
684-
if (til::is_surrogate(wch))
687+
colBegDirty = colPrev;
688+
colEnd = colPrev;
689+
690+
const auto colEndNew = gsl::narrow_cast<uint16_t>(colEnd + state.width);
691+
if (colEndNew > colLimit)
685692
{
686-
if (it != end && til::is_leading_surrogate(wch) && til::is_trailing_surrogate(*it))
687-
{
688-
advance = 2;
689-
++it;
690-
}
691-
else
692-
{
693-
ptr = &UNICODE_REPLACEMENT;
694-
}
693+
colEndDirty = colLimit;
694+
charsConsumed = ch - chBeg;
695+
return;
695696
}
696697

697-
width = IsGlyphFullWidth({ ptr, advance }) + 1u;
698-
}
698+
// Fill our char-offset buffer with 1 entry containing the mapping from the
699+
// current column (colEnd) to the start of the joined glyph in the string (chPrev)...
700+
til::at(row._charOffsets, colEnd++) = gsl::narrow_cast<uint16_t>(chPrev);
701+
// ...followed by 0-N entries containing an indication that the
702+
// columns are just a wide-glyph extension of the preceding one.
703+
while (colEnd < colEndNew)
704+
{
705+
til::at(row._charOffsets, colEnd++) = gsl::narrow_cast<uint16_t>(chPrev | CharOffsetsTrailer);
706+
}
699707

700-
const auto colEndNew = gsl::narrow_cast<uint16_t>(colEnd + width);
701-
if (colEndNew > colLimit)
702-
{
703-
colEndDirty = colLimit;
704-
charsConsumed = ch - chBeg;
705-
return;
708+
ch += state.len;
709+
it += state.len;
706710
}
711+
}
712+
else
713+
{
714+
// The non-ASCII character we have encountered may be a combining mark, like "a^" which is then displayed as "â".
715+
// In order to recognize both characters as a single grapheme, we need to back up by 1 ASCII character
716+
// and let GraphemeNext() find the next proper grapheme boundary.
717+
--colEnd;
718+
--ch;
719+
--it;
720+
}
721+
722+
if (const auto end = chars.end(); it != end)
723+
{
724+
GraphemeState state{ .beg = &*it };
707725

708-
// Fill our char-offset buffer with 1 entry containing the mapping from the
709-
// current column (colEnd) to the start of the glyph in the string (ch)...
710-
til::at(row._charOffsets, colEnd++) = gsl::narrow_cast<uint16_t>(ch);
711-
// ...followed by 0-N entries containing an indication that the
712-
// columns are just a wide-glyph extension of the preceding one.
713-
while (colEnd < colEndNew)
726+
do
714727
{
715-
til::at(row._charOffsets, colEnd++) = gsl::narrow_cast<uint16_t>(ch | CharOffsetsTrailer);
716-
}
728+
cwd.GraphemeNext(state, chars);
729+
730+
const auto colEndNew = gsl::narrow_cast<uint16_t>(colEnd + state.width);
731+
if (colEndNew > colLimit)
732+
{
733+
colEndDirty = colLimit;
734+
charsConsumed = ch - chBeg;
735+
return;
736+
}
737+
738+
// Fill our char-offset buffer with 1 entry containing the mapping from the
739+
// current column (colEnd) to the start of the glyph in the string (ch)...
740+
til::at(row._charOffsets, colEnd++) = gsl::narrow_cast<uint16_t>(ch);
741+
// ...followed by 0-N entries containing an indication that the
742+
// columns are just a wide-glyph extension of the preceding one.
743+
while (colEnd < colEndNew)
744+
{
745+
til::at(row._charOffsets, colEnd++) = gsl::narrow_cast<uint16_t>(ch | CharOffsetsTrailer);
746+
}
717747

718-
ch += advance;
748+
ch += state.len;
749+
it += state.len;
750+
} while (it != end);
719751
}
720752

721753
colEndDirty = colEnd;
@@ -1058,7 +1090,7 @@ std::wstring_view ROW::GetText() const noexcept
10581090

10591091
std::wstring_view ROW::GetText(til::CoordType columnBegin, til::CoordType columnEnd) const noexcept
10601092
{
1061-
const til::CoordType columns = _columnCount;
1093+
const auto columns = GetReadableColumnCount();
10621094
const auto colBeg = clamp(columnBegin, 0, columns);
10631095
const auto colEnd = clamp(columnEnd, colBeg, columns);
10641096
const size_t chBeg = _uncheckedCharOffset(gsl::narrow_cast<size_t>(colBeg));

0 commit comments

Comments
 (0)