diff --git a/README.md b/README.md
index 2dd668278..ffff3fa26 100644
--- a/README.md
+++ b/README.md
@@ -407,6 +407,7 @@ You can refine your search by selecting the task you're interested in (e.g., [te
 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham.
+1. **[TextNet](https://huggingface.co/docs/transformers/model_doc/textnet)** released with the paper [FAST: Faster Arbitrarily-Shaped Text Detector with Minimalist Kernel Representation](https://arxiv.org/abs/2111.02394) by Zhe Chen, Jiahao Wang, Wenhai Wang, Guo Chen, Enze Xie, Ping Luo, Tong Lu.
 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
diff --git a/docs/snippets/6_supported-models.snippet b/docs/snippets/6_supported-models.snippet
index 7d3bdf75d..8fad32304 100644
--- a/docs/snippets/6_supported-models.snippet
+++ b/docs/snippets/6_supported-models.snippet
@@ -122,6 +122,7 @@
 1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham.
+1. **[TextNet](https://huggingface.co/docs/transformers/model_doc/textnet)** released with the paper [FAST: Faster Arbitrarily-Shaped Text Detector with Minimalist Kernel Representation](https://arxiv.org/abs/2111.02394) by Zhe Chen, Jiahao Wang, Wenhai Wang, Guo Chen, Enze Xie, Ping Luo, Tong Lu.
 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
diff --git a/src/models.js b/src/models.js
index f6828391b..4551d4d43 100644
--- a/src/models.js
+++ b/src/models.js
@@ -4712,6 +4712,18 @@ export class ViTForImageClassification extends ViTPreTrainedModel {
 }
 //////////////////////////////////////////////////
 
+//////////////////////////////////////////////////
+export class TextNetPreTrainedModel extends PreTrainedModel { }
+export class TextNetModel extends TextNetPreTrainedModel { }
+export class TextNetForImageClassification extends TextNetPreTrainedModel {
+    /**
+     * @param {any} model_inputs
+     */
+    async _call(model_inputs) {
+        return new SequenceClassifierOutput(await super._call(model_inputs));
+    }
+}
+//////////////////////////////////////////////////
 
 //////////////////////////////////////////////////
 export class IJepaPreTrainedModel extends PreTrainedModel { }
@@ -7002,6 +7014,7 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
     ['rt_detr', ['RTDetrModel', RTDetrModel]],
     ['table-transformer', ['TableTransformerModel', TableTransformerModel]],
     ['vit', ['ViTModel', ViTModel]],
+    ['textnet', ['TextNetModel', TextNetModel]],
     ['ijepa', ['IJepaModel', IJepaModel]],
     ['pvt', ['PvtModel', PvtModel]],
     ['vit_msn', ['ViTMSNModel', ViTMSNModel]],
@@ -7251,6 +7264,7 @@ const MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = new Map([
 
 const MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = new Map([
     ['vit', ['ViTForImageClassification', ViTForImageClassification]],
+    ['textnet', ['TextNetForImageClassification', TextNetForImageClassification]],
     ['ijepa', ['IJepaForImageClassification', IJepaForImageClassification]],
     ['pvt', ['PvtForImageClassification', PvtForImageClassification]],
     ['vit_msn', ['ViTMSNForImageClassification', ViTMSNForImageClassification]],
diff --git a/src/models/image_processors.js b/src/models/image_processors.js
index fd002c81c..03100a5a8 100644
--- a/src/models/image_processors.js
+++ b/src/models/image_processors.js
@@ -32,6 +32,7 @@ export * from './sam/image_processing_sam.js'
 export * from './segformer/image_processing_segformer.js'
 export * from './siglip/image_processing_siglip.js'
 export * from './swin2sr/image_processing_swin2sr.js'
+export * from './textnet/image_processing_textnet.js'
 export * from './vit/image_processing_vit.js'
 export * from './vitmatte/image_processing_vitmatte.js'
 export * from './vitpose/image_processing_vitpose.js'
diff --git a/src/models/textnet/image_processing_textnet.js b/src/models/textnet/image_processing_textnet.js
new file mode 100644
index 000000000..49450bbf5
--- /dev/null
+++ b/src/models/textnet/image_processing_textnet.js
@@ -0,0 +1,6 @@
+import {
+    ImageProcessor,
+} from "../../base/image_processors_utils.js";
+
+export class TextNetImageProcessor extends ImageProcessor { }
+
diff --git a/tests/models/textnet/test_image_processing_textnet.js b/tests/models/textnet/test_image_processing_textnet.js
new file mode 100644
index 000000000..99f126a4b
--- /dev/null
+++ b/tests/models/textnet/test_image_processing_textnet.js
@@ -0,0 +1,31 @@
+import { AutoImageProcessor, TextNetImageProcessor } from "../../../src/transformers.js";
+
+import { load_cached_image } from "../../asset_cache.js";
+import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js";
+
+export default () => {
+  describe("TextNetImageProcessor", () => {
+    const model_id = "onnx-community/textnet-tiny";
+
+    /** @type {TextNetImageProcessor} */
+    let processor;
+    beforeAll(async () => {
+      processor = await AutoImageProcessor.from_pretrained(model_id);
+    }, MAX_PROCESSOR_LOAD_TIME);
+
+    it(
+      "default",
+      async () => {
+        const image = await load_cached_image("receipt");
+        const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image);
+
+        expect(pixel_values.dims).toEqual([1, 3, 960, 640]);
+        expect(pixel_values.mean().item()).toBeCloseTo(0.8106788992881775, 6);
+
+        expect(original_sizes).toEqual([[864, 576]]);
+        expect(reshaped_input_sizes).toEqual([[960, 640]]);
+      },
+      MAX_TEST_EXECUTION_TIME,
+    );
+  });
+};