@@ -11,6 +11,7 @@ import type {
11
11
import type {
12
12
PDFPageProxy ,
13
13
PDFPageView ,
14
+ PDFJSLibrary ,
14
15
PDFViewer ,
15
16
TextLayer ,
16
17
} from '../../types/pdfjs' ;
@@ -86,6 +87,15 @@ function getPDFViewer(): PDFViewer {
86
87
return PDFViewerApplication . pdfViewer ;
87
88
}
88
89
90
/**
 * Return the PDF.js library object exposed by the viewer as the `pdfjsLib`
 * global.
 *
 * @throws {Error} If the `pdfjsLib` global is missing or is not an object.
 */
function getPDFJSLib(): PDFJSLibrary {
  // Read the global exactly once. `typeof` is safe to use on an undeclared
  // identifier, so this does not throw a `ReferenceError` if PDF.js has not
  // been loaded.
  //
  // @ts-ignore - TS doesn't know about the `pdfjsLib` global.
  const lib = typeof pdfjsLib === 'object' ? pdfjsLib : null;

  // `typeof null` is also 'object', so check for null explicitly rather than
  // handing callers a null "library".
  if (lib === null) {
    throw new Error('PDF.js library not available');
  }
  return lib;
}
98
+
89
99
/**
90
100
* Returns the view into which a PDF page is drawn.
91
101
*
@@ -144,6 +154,17 @@ export async function documentHasText() {
144
154
return hasText ;
145
155
}
146
156
157
/**
 * Apply PDF.js's Unicode normalization to `str`, if the loaded PDF.js release
 * provides it.
 *
 * @param str - Text to normalize.
 * @return The result of PDF.js's `normalizeUnicode`, or `str` unchanged when
 *   that API does not exist.
 */
function normalizeUnicode(str: string) {
  const { normalizeUnicode: pdfjsNormalize } = getPDFJSLib();

  // Older PDF.js versions lack this API. In those versions both the extracted
  // text and the text layer are already normalized, so the input can be
  // passed through untouched.
  return pdfjsNormalize ? pdfjsNormalize(str) : str;
}
167
+
147
168
/**
148
169
* Return the text of a given PDF page.
149
170
*
@@ -164,6 +185,10 @@ function getPageTextContent(pageIndex: number): Promise<string> {
164
185
const textContent = await pageView . pdfPage . getTextContent ( {
165
186
// Deprecated option, set for compatibility with older PDF.js releases.
166
187
normalizeWhitespace : true ,
188
+
189
+ // There is a `disableNormalization` option which defaults to false.
190
+ // The result is that `textContent` will be normalized as if passed to
191
+ // `normalizeUnicode(textContent)`.
167
192
} ) ;
168
193
return textContent . items . map ( it => it . str ) . join ( '' ) ;
169
194
} ;
@@ -275,6 +300,55 @@ export function isTextLayerRenderingDone(textLayer: TextLayer): boolean {
275
300
return textLayer . div . querySelector ( '.endOfContent' ) !== null ;
276
301
}
277
302
303
+ /**
304
+ * A cache of text layer elements which have been normalized.
305
+ *
306
+ * Text layer elements are assumed to be immutable, i.e., their text content
307
+ * doesn't change after PDF.js initially renders them.
308
+ */
309
+ const normalizedTextLayers = new WeakSet < HTMLElement > ( ) ;
310
+
311
/**
 * Normalize the Unicode content of every text node in a text layer, in place.
 *
 * This makes the text layer's content agree with the text returned by
 * PDF.js's text extraction APIs, apart from whitespace differences, which
 * are handled via {@link translateOffsets}. Text annotations can then be
 * anchored against the extracted text, and the resulting positions used to
 * find the matching parts of the text layer.
 *
 * Older PDF.js versions applied the same normalization to both the text
 * extraction APIs and the text layer. That changed, as described in
 * https://github.com/hypothesis/client/issues/6784#issuecomment-2624697567.
 * Normalizing the text layer ourselves lets old and new PDF.js versions be
 * supported at the same time.
 *
 * Each text layer is processed at most once; later calls with the same
 * element return without touching the DOM.
 *
 * @param textLayer - Root element of the text layer to normalize.
 */
function normalizeTextLayer(textLayer: HTMLElement) {
  if (!normalizedTextLayers.has(textLayer)) {
    const textNodes = textLayer.ownerDocument!.createNodeIterator(
      textLayer,
      NodeFilter.SHOW_TEXT,
    );

    for (
      let node = textNodes.nextNode();
      node !== null;
      node = textNodes.nextNode()
    ) {
      const textNode = node as Text;
      const before = textNode.data;
      const after = normalizeUnicode(before);

      // Skip the write when nothing changed, so the browser is not asked to
      // do potentially unnecessary work.
      if (before !== after) {
        textNode.data = after;
      }
    }

    // Record the layer only after every node was processed successfully.
    normalizedTextLayers.add(textLayer);
  }
}
351
+
278
352
/**
279
353
* Locate the DOM Range which a position selector refers to.
280
354
*
@@ -305,16 +379,18 @@ async function anchorByPosition(
305
379
) {
306
380
// The page has been rendered. Locate the position in the text layer.
307
381
//
308
- // We allow for differences in whitespace between the text returned by
309
- // `getPageTextContent` and the text layer content. Any other differences
310
- // will cause mis-anchoring.
382
+ // We allow for differences in whitespace and Unicode normalization between
383
+ // the text returned by `getPageTextContent` and the text layer content. Any
384
+ // other differences will cause mis-anchoring.
311
385
312
386
const root = page . textLayer . textLayerDiv ?? page . textLayer . div ;
313
387
if ( ! root ) {
314
388
/* istanbul ignore next */
315
389
throw new Error ( 'Unable to find PDF.js text layer root' ) ;
316
390
}
317
391
392
+ normalizeTextLayer ( root ) ;
393
+
318
394
const textLayerStr = root . textContent ! ;
319
395
320
396
const [ textLayerStart , textLayerEnd ] = translateOffsets (
0 commit comments