Skip to content

Commit 77c03bc

Browse files
committed
Apply Unicode normalization to text layer and extracted text
Text quote anchoring in PDFs relies on the content of the rendered text layer matching the text produced by PDF.js's text extraction APIs, except for differences in whitespace which are handled by a `translateOffsets` helper. This assumption no longer holds in more recent versions of PDF.js because Unicode normalization is no longer applied to the text layer. Resolve the issue by applying normalization to the text layer ourselves. Part of #6784.
1 parent 069eba5 commit 77c03bc

File tree

2 files changed

+99
-3
lines changed

2 files changed

+99
-3
lines changed

src/annotator/anchoring/pdf.ts

Lines changed: 79 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import type {
1111
import type {
1212
PDFPageProxy,
1313
PDFPageView,
14+
PDFJSLibrary,
1415
PDFViewer,
1516
TextLayer,
1617
} from '../../types/pdfjs';
@@ -86,6 +87,15 @@ function getPDFViewer(): PDFViewer {
8687
return PDFViewerApplication.pdfViewer;
8788
}
8889

90+
/**
 * Return the PDF.js library object exposed via the `pdfjsLib` global.
 *
 * @throws {Error} If the `pdfjsLib` global is undeclared, `null` or not an
 *   object.
 */
function getPDFJSLib(): PDFJSLibrary {
  // `typeof` is safe to apply to an undeclared global, whereas a bare
  // reference would throw a `ReferenceError`. Anything that is not an object
  // is collapsed to `null` so that there is a single failure check below.
  //
  // @ts-ignore - TS doesn't know about the `pdfjsLib` global.
  const lib: unknown = typeof pdfjsLib === 'object' ? pdfjsLib : null;

  // `typeof null` is also 'object', so `null` must be rejected explicitly.
  if (lib === null) {
    throw new Error('PDF.js library not available');
  }

  return lib as PDFJSLibrary;
}
98+
8999
/**
90100
* Returns the view into which a PDF page is drawn.
91101
*
@@ -144,6 +154,17 @@ export async function documentHasText() {
144154
return hasText;
145155
}
146156

157+
/**
 * Apply PDF.js's Unicode normalization to `str`, when the loaded PDF.js
 * library exposes a `normalizeUnicode` function.
 *
 * @param str - Text to normalize
 * @returns The normalized text, or `str` unchanged if the API is missing
 */
function normalizeUnicode(str: string) {
  const { normalizeUnicode: pdfjsNormalize } = getPDFJSLib();

  if (!pdfjsNormalize) {
    // Older PDF.js releases lack this API. In those releases both the
    // extracted text and the text layer are already normalized, so there is
    // nothing to do.
    return str;
  }

  return pdfjsNormalize(str);
}
167+
147168
/**
148169
* Return the text of a given PDF page.
149170
*
@@ -164,6 +185,10 @@ function getPageTextContent(pageIndex: number): Promise<string> {
164185
const textContent = await pageView.pdfPage.getTextContent({
165186
// Deprecated option, set for compatibility with older PDF.js releases.
166187
normalizeWhitespace: true,
188+
189+
// There is a `disableNormalization` option which defaults to false.
190+
// The result is that `textContent` will be normalized as if passed to
191+
// `normalizeUnicode(textContent)`.
167192
});
168193
return textContent.items.map(it => it.str).join('');
169194
};
@@ -275,6 +300,55 @@ export function isTextLayerRenderingDone(textLayer: TextLayer): boolean {
275300
return textLayer.div.querySelector('.endOfContent') !== null;
276301
}
277302

303+
/**
 * Text layer root elements whose text nodes have already been normalized.
 *
 * Entries are never invalidated: text layer elements are assumed to be
 * immutable, ie. their text content doesn't change after PDF.js initially
 * renders them.
 */
const normalizedTextLayers = new WeakSet<HTMLElement>();

/**
 * Rewrite the text nodes of a text layer so that they are Unicode-normalized.
 *
 * Anchoring works by locating a quote in the text returned by PDF.js's text
 * extraction APIs and then mapping the resulting offsets onto the rendered
 * text layer. That only works if both sources contain the same text, except
 * for differences in whitespace which are handled via
 * {@link translateOffsets}.
 *
 * Older versions of PDF.js applied the same normalization to extracted text
 * and to the text layer. This changed, as described in
 * https://github.com/hypothesis/client/issues/6784#issuecomment-2624697567.
 * Normalizing the text layer ourselves lets us support both old and new
 * versions of PDF.js simultaneously.
 *
 * Each text layer is processed at most once; later calls for the same
 * element return immediately.
 *
 * @param textLayer - Root element of the text layer to normalize in place
 */
function normalizeTextLayer(textLayer: HTMLElement) {
  if (normalizedTextLayers.has(textLayer)) {
    return;
  }

  const textNodes = textLayer.ownerDocument!.createNodeIterator(
    textLayer,
    NodeFilter.SHOW_TEXT,
  );

  for (
    let node = textNodes.nextNode();
    node !== null;
    node = textNodes.nextNode()
  ) {
    const textNode = node as Text;
    const original = textNode.data;
    const normalized = normalizeUnicode(original);

    // Skip the write when nothing changed, to avoid potentially triggering
    // unnecessary work inside the browser.
    if (normalized !== original) {
      textNode.data = normalized;
    }
  }

  normalizedTextLayers.add(textLayer);
}
351+
278352
/**
279353
* Locate the DOM Range which a position selector refers to.
280354
*
@@ -305,16 +379,18 @@ async function anchorByPosition(
305379
) {
306380
// The page has been rendered. Locate the position in the text layer.
307381
//
308-
// We allow for differences in whitespace between the text returned by
309-
// `getPageTextContent` and the text layer content. Any other differences
310-
// will cause mis-anchoring.
382+
// We allow for differences in whitespace and Unicode normalization between
383+
// the text returned by `getPageTextContent` and the text layer content. Any
384+
// other differences will cause mis-anchoring.
311385

312386
const root = page.textLayer.textLayerDiv ?? page.textLayer.div;
313387
if (!root) {
314388
/* istanbul ignore next */
315389
throw new Error('Unable to find PDF.js text layer root');
316390
}
317391

392+
normalizeTextLayer(root);
393+
318394
const textLayerStr = root.textContent!;
319395

320396
const [textLayerStart, textLayerEnd] = translateOffsets(

src/types/pdfjs.ts

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -287,3 +287,23 @@ export type TextLayer = {
287287
/** Old name for root element of text layer. */
288288
textLayerDiv?: HTMLElement;
289289
};
290+
291+
/**
 * Type of the `pdfjsLib` global.
 *
 * See https://github.com/mozilla/pdf.js/blob/fc68a9f3ee3cfe5d9e80a260a157dc6ef28fbff8/src/pdf.js#L97.
 */
export type PDFJSLibrary = {
  /**
   * Apply PDF.js's custom Unicode normalization.
   *
   * This is effectively NFKC normalization, but only applied to a subset of
   * characters.
   *
   * This function is not available in older versions of PDF.js (pre-2023).
   * Those older versions always apply normalization to both the text layer and
   * extracted text. Newer versions, which do have this API, normalize extracted
   * text by default, but not the text layer.
   *
   * Declared as a function-typed property rather than a method because
   * callers read it off the library object and invoke it detached, so it must
   * not depend on `this`.
   */
  normalizeUnicode?: (str: string) => string;
};

0 commit comments

Comments
 (0)