From 0841f61eb4d2e8cb97ffe592dd1e493bb484c645 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Sun, 26 Jan 2025 11:35:18 +0100 Subject: [PATCH] PDFBOX-5747: Surrogate pairs with combining diacritics are incorrectly ordered on text extraction - Changed TextPosition.insertDiacritic() to preserve surrogate pairs - Added unit test - Included example test PDF file attached to PDFBOX-5747 --- .../org/apache/pdfbox/text/TextPosition.java | 15 ++++++++++++--- ...BOX-5747-unicode-surrogate-with-diacritic.pdf | Bin 0 -> 6729 bytes ...icode-surrogate-with-diacritic.pdf-sorted.txt | 3 +++ ...5747-unicode-surrogate-with-diacritic.pdf.txt | 3 +++ 4 files changed, 18 insertions(+), 3 deletions(-) create mode 100644 pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic.pdf create mode 100644 pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic.pdf-sorted.txt create mode 100644 pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic.pdf.txt diff --git a/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java b/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java index 953bd8d251f..dc42aca0e9b 100644 --- a/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java +++ b/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java @@ -759,16 +759,25 @@ private void insertDiacritic(int i, TextPosition diacritic) float[] widths2 = new float[widths.length + 1]; System.arraycopy(widths, 0, widths2, 0, i); + // First we add a zero-width entry for the diacritic in the widths array + widths2[i] = widths[i]; + widths2[i + 1] = 0; + System.arraycopy(widths, i + 1, widths2, i + 2, widths.length - i - 1); + // Unicode combining diacritics always go after the base character, regardless of whether // the string is in presentation order or logical order sb.append(unicode.charAt(i)); - widths2[i] = widths[i]; + + // If a surrogate starts at the current position, make sure we preserve it + if (i < unicode.length() - 1 && Character.isSurrogatePair(unicode.charAt(i), unicode.charAt(i + 1))) { + sb.append(unicode.charAt(i + 1)); + i++; + } + sb.append(combineDiacritic(diacritic.getUnicode())); - widths2[i + 1] = 0; // get the rest of the string sb.append(unicode.substring(i + 1)); - System.arraycopy(widths, i + 1, widths2, i + 2, widths.length - i - 1); unicode = sb.toString(); widths = widths2; diff --git a/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic.pdf b/pdfbox/src/test/resources/input/PDFBOX-5747-unicode-surrogate-with-diacritic.pdf new file mode 100644 index 0000000000000000000000000000000000000000..9fc1d4069ca40635a00159ba8cc5760bf64ba910 GIT binary patch literal 6729 zcmb7J2_TeR+qP#5B|_*?l-+CwAq<1DmVIY17{(A|tL#FgqC%D|iC3>wwuEd^B(f!2 z_N^3U&-%}dwES=1_rL$lnCCgmJn3zYO&|ZN@A}*dr$Cc+pc|q5rcR9K2_o!2_zCSq$?2{2n0rICzYT-AS8o^^e1Q-5{Q5S3JRnO z;!6q*A{GECRTTgXNI9U1-$WRR4ZGh`>K=H)FTI^4KoVd8zUKzg50I)rfXrSM(T@1} z{(RXV&j+N80z5H*l)k;IA;FzIn(sc9+q0C?1Hj}4b*z^+0f2qK|L=ZQY<Ar<@ka*&Jx9=#;q6cMFQu zmK4hvbb=H_GaVDd51o}cRp4}V?!7j z*AgYT?nTowxX;A)E39j#Y7CU1ZXnh}6p!8P6x7WKfaxMqP8+0fu0Nv985?8j=D5{i z&3D_phkKv%PIt|xPg^@pbL+i*+<9FWymq|bJe|Hse4hXVdl-rFx&1^|Ehh`Vt!d^Gz!i6 z&=9*gDHNG4-Ey<>-c$MR$$-@zr?=ECN3DwNrJoNrDzxC7dcd={b#+eNcqQ({2|nm_ ze35DqR-71cCms@P_m}OI$rRnxsVVg-i%3jR^QLHgP(>;jk|BiCS_6S4zCOq~nQ#+j zD`O63>1Wy=g|FofD%WuiPUtRD1>erp7+q6+g8qkApwzMngEo9@xbGO@D`-0 zaW$y64oS|;;h5m8b>@P(Fx@t7*Cwexr~Z&HH25W_tEyAw+q#=HU)Wba#4~VlD}7Lm z^<>fi$l_d!zAc8Chm;3~y@xvRchIUFHIgf4^R>%DX|RhxFg;4FR%Y~qaw|)pdGE&v~5auoQhO_+^z*@%5yh6;?IRW%+QZfL{W~SfuuW zwc0=p+^QVZhnGtyinez5VU@ERC(kquWD} zORC22MZe_pekix8nj9D7td|IrdxxC0y2|CAiHiJGYFE z;UtY^da77$ z@M{B}PXp*pmiQ$-hC(_kLxJW2I{D6fDfX0gG!{sCqql%f>(=YY z=uNnMlkc$o6K~eyCLy+<(+T3@y~a)xRtMhf%j z&|wF(mFKC_wcn=fXflLq{T81= z){!ybO@>{tvefog#rxC8Ch!NPMk~~dpCFR@0yr}O}|ZX>vHo2f2-ph(o0-50eX6OI0~-4#0^4vs*p)y zrs0Tm?0L#j4d_~H2l!ODbJhIu2vE0Y6Ju8Ybs@)`$AgHnWBGjfEFUVBhmC=5lgI@v zbp3Ry%=nEE{TC;zs@{N3RE5Dp1dm)y50qOBBtst!Fo7zeC*a$*#xMidS~tqkaN+xlHfK z+W+3hGXRI?r4*@2NiSnDr7>m`_naDm*ZNF$;^Zcp4EMXS@z-@u|@Wd@4a#qcb2sKyw7l>wuMc;=S3ZF|f&l1Uc zI&s9Y$A4aG3?HpqB*pmo96Q64D9id*HayGxJjb2Jcg$yL)g}uGgB0cP@vO=@y1z>c ztFtKA_HWYF^W`~Z++e4&=0EOz>8n$_*S(1uGyfbn`ijVus7u#hh*PKVw=W6vnNwxP z@Wk<#oWD_YdQppQeyM1@cYaYX-0xKo|5LSf1tIe`w>)KeiPd|@9qoeh@k}nZu!o4# zm*JY(4!2g3UvS*QeC$WpG|oAA^kAD#Kd1>Aq;d_4+dwSAuB6@hpcwV#Ky_h^79e^~ z0=doOTAhi*meCXm<=@x%QVB@5;T`u`tm8(h+3<{?nPO(Rk2kW3e0*Ie5z_ZICc$3V zc-lDUU9DTsag2aa#7%eea&FIOHYxcbI!|-Fgb=4+t%|UOWux$>WgTbmCk-`M-2Fti zPR(>6#o6h{WqZd~!-t0<(c>j9Z=W{?3A0SdcAeq1bZCf;Na&wiRlYXR7o|bTYKb^? zc;>yRB^&|2@B8tUim8pYQ4MtiRT1Lw>ig$(gL896`nT=OmnNOR_En zFuq-}_s5=5+j6@K5%h%$Z4R~yCcZSR4(2&(Wy-WY2W)bS=F$7X%d|biyiMxga|YAU z)D5eY|4qZlFl5g-$7`9}Ws;bj)nH(jr)(CBEgTe{U`fhoV5N9dp?_5!EqH1?H$v%J zUQG58fs~IHp%)P#ia@;}d@4SygD}iiCt+}3ny2s1-gBGe^S~?R<^?sNf_$>r|Owcc2*`r4cpo-uAZH-PS3&I4r^a_cp*X`c%3`% z`kLb6h5CYr#Y14<9HpXd%_{VeV8QwoLpF;dn}lko!1U$B$3htifm&l?Q)2a2`suSO zFEq~=PG6rr`&#T&k~hbs*2>L^{531&wCIY_M84VGqR7BQ-!!XZU;FieYmUEurWKlU zarQ)M>QH0g)hQka9(0gGfG@1HZhR~7lE@j8?I6lLh6mk$aN?dMCJF(lH0Xy9_rxDb z$nDB3l5A7McsqDuJqaFOyGn{!rH6JWYA_Xjbt9yXm?|3Uf-c zKp;H)0ZTvvLi~Va0XYa!=Sj-}*5uA80vd;PP{BLnFu<<4injxXs1}J`$Q?+}CzVOI z%JG3#r$iegy=;rk$n$WH5PTP0C4NN+Je$!g8rJz-$O#3dt zDCCgp{%fMqqSO20CBm3rD=$XeJF7#`Y1Y?T)7s*npf#$wL?Oj)sm0!mpyDmHE%4fkKYWK47DO zua$rJ!`8O9s)^2qLF9gHZNFxa_M+uy#^_H;5@Po0M;6ffOp9v+h`DM*j4;U0Io*L^ ztvgee#Jiq}RjOr`lZKU@OHzMw*!|Z&9Vu`aS*wu)k!VATa12u*9wW=Q8{koS}`>u`tv9jWfO{>;400 zz(CS~1XuK{6uok^#mUON_z*Kv;#Ch{cGI>WMEYeJ@9h7l|%} zQI14z#1aC2G#UDMdg3tdB*r9@4sqxt3({uy3-yrwwtF@+1uQ{CYJr1@tV||JI2;aG zTmQn8zizevY0Bh{^xw{PkK6#Mz2Qq}l5Kan-VseCZQ^)Ij7P@XVE3r-3muTw^KW$U z&p6coM~WEo^FXQxKzCsw=ZL{F-wXdeNvwNKk%^cjMy_th$d_q%9zL?EY-M4kCOnoS z8^Pn2xu(UVvbEe3J+ctlqL{QuACP@$$^T@gdAxYi^$lka0k^Jx0ZKq-PGN-p60dxk zz0txJr*g9^$L0RTZ$oC+O2BPU&32k>w}?qsM2u5T%+sEzPtISLC3Qi3PvWmAg`6JC=scpf zsg(TB#Y_g``zanQEB#|J|MLK7W`J=bW3#tuf_9HKf7&rgVP=Ruj{zXND~=dgU;qL@ zcAaP7;Xwc(yEIPfNeXk~{}}fC_+zhiu(Mm+$+#RE8yb2!G(0pk9!#$Z`VwviS4JqX zY101{UY-;})3yJ4?_suCzUw1#lm~>A!)y