Apply Unicode normalization to text layer and extracted text

robertknight · robertknight · commit 77c03bc15f91 · 2025-05-20T17:15:58.000+01:00
Text quote anchoring in PDFs relies on the content of the rendered text layer matching the text produced by PDF.js's text extraction APIs, except for differences in whitespace which are handled by a `translateOffsets` helper. This assumption no longer holds in more recent versions of PDF.js because Unicode normalization is no longer applied to the text layer. Resolve the issue by applying normalization to the text layer ourselves. Part of #6784.
diff --git a/src/annotator/anchoring/pdf.ts b/src/annotator/anchoring/pdf.ts
@@ -11,6 +11,7 @@ import type {
 import type {
   PDFPageProxy,
   PDFPageView,
+  PDFJSLibrary,
   PDFViewer,
   TextLayer,
 } from '../../types/pdfjs';
@@ -86,6 +87,15 @@ function getPDFViewer(): PDFViewer {
   return PDFViewerApplication.pdfViewer;
 }
 
+function getPDFJSLib(): PDFJSLibrary {
+  // @ts-ignore - TS doesn't know about the `pdfjsLib` global set by PDF.js.
+  if (typeof pdfjsLib !== 'object') {
+    throw new Error('PDF.js library not available');
+  }
+  // @ts-ignore - TS doesn't know about the `pdfjsLib` global set by PDF.js.
+  return pdfjsLib;
+}
+
 /**
  * Returns the view into which a PDF page is drawn.
  *
@@ -144,6 +154,17 @@ export async function documentHasText() {
   return hasText;
 }
 
+function normalizeUnicode(str: string) {
+  // Older PDF.js releases do not expose `normalizeUnicode`. In those versions
+  // both the extracted text and the text layer are already normalized, so
+  // the input is returned unchanged.
+  const pdfjsNormalize = getPDFJSLib().normalizeUnicode;
+  if (!pdfjsNormalize) {
+    return str;
+  }
+  return pdfjsNormalize(str);
+}
+
 /**
  * Return the text of a given PDF page.
  *
@@ -164,6 +185,10 @@ function getPageTextContent(pageIndex: number): Promise<string> {
     const textContent = await pageView.pdfPage.getTextContent({
       // Deprecated option, set for compatibility with older PDF.js releases.
       normalizeWhitespace: true,
+
+      // The `disableNormalization` option is left at its default of `false`,
+      // so the `str` of each returned item is normalized in the same way as
+      // if it had been passed to `normalizeUnicode`.
     });
     return textContent.items.map(it => it.str).join('');
   };
@@ -275,6 +300,55 @@ export function isTextLayerRenderingDone(textLayer: TextLayer): boolean {
   return textLayer.div.querySelector('.endOfContent') !== null;
 }
 
+/**
+ * Text layer elements already processed by {@link normalizeTextLayer}.
+ *
+ * Text layer elements are assumed to be immutable, i.e. their text content
+ * doesn't change after PDF.js initially renders them.
+ */
+const normalizedTextLayers = new WeakSet<HTMLElement>();
+
+/**
+ * Normalize the Unicode content of every text node in a text layer, in place.
+ *
+ * Anchoring works by locating a quote in the text returned by PDF.js's text
+ * extraction APIs and then mapping the resulting offsets onto the rendered
+ * text layer. That mapping only works if both strings have the same content,
+ * apart from whitespace differences which {@link translateOffsets} accounts
+ * for.
+ *
+ * Older PDF.js releases normalized both the extracted text and the text layer.
+ * Newer releases no longer normalize the text layer, see
+ * https://github.com/hypothesis/client/issues/6784#issuecomment-2624697567.
+ * Normalizing the text layer here keeps the two in sync for old and new
+ * PDF.js versions alike. Each text layer element is processed at most once;
+ * repeat calls with the same element return immediately.
+ */
+function normalizeTextLayer(textLayer: HTMLElement) {
+  // Each text layer only needs to be processed once.
+  if (normalizedTextLayers.has(textLayer)) {
+    return;
+  }
+
+  const textNodes: NodeIterator = textLayer.ownerDocument!.createNodeIterator(
+    textLayer,
+    NodeFilter.SHOW_TEXT,
+  );
+  for (let node = textNodes.nextNode(); node; node = textNodes.nextNode()) {
+    const textNode = node as Text;
+    const original = textNode.data;
+    const normalized = normalizeUnicode(original);
+
+    // Skip the write when nothing changed, so the browser is not asked to do
+    // needless work for text that is already normalized.
+    if (original !== normalized) {
+      textNode.data = normalized;
+    }
+  }
+
+  normalizedTextLayers.add(textLayer);
+}
+
 /**
  * Locate the DOM Range which a position selector refers to.
  *
@@ -305,16 +379,18 @@ async function anchorByPosition(
   ) {
     // The page has been rendered. Locate the position in the text layer.
     //
-    // We allow for differences in whitespace between the text returned by
-    // `getPageTextContent` and the text layer content. Any other differences
-    // will cause mis-anchoring.
+    // We allow for differences in whitespace and Unicode normalization between
+    // the text returned by `getPageTextContent` and the text layer content. Any
+    // other differences will cause mis-anchoring.
 
     const root = page.textLayer.textLayerDiv ?? page.textLayer.div;
     if (!root) {
       /* istanbul ignore next */
       throw new Error('Unable to find PDF.js text layer root');
     }
 
+    normalizeTextLayer(root);
+
     const textLayerStr = root.textContent!;
 
     const [textLayerStart, textLayerEnd] = translateOffsets(
diff --git a/src/types/pdfjs.ts b/src/types/pdfjs.ts
@@ -287,3 +287,23 @@ export type TextLayer = {
   /** Old name for root element of text layer. */
   textLayerDiv?: HTMLElement;
 };
+
+/**
+ * Type of the `pdfjsLib` global. Only a subset of its exports is declared.
+ *
+ * See https://github.com/mozilla/pdf.js/blob/fc68a9f3ee3cfe5d9e80a260a157dc6ef28fbff8/src/pdf.js#L97.
+ */
+export type PDFJSLibrary = {
+  /**
+   * Apply PDF.js's custom Unicode normalization.
+   *
+   * This is effectively NFKC normalization, but only applied to a subset of
+   * characters.
+   *
+   * Optional because older versions of PDF.js (pre-2023) do not provide it.
+   * Those versions always normalize both the text layer and extracted text.
+   * Newer versions, which do have this API, normalize extracted text by
+   * default, but not the text layer.
+   */
+  normalizeUnicode?(str: string): string;
+};