Skip to content

Commit cf230ab

Browse files
author
theblackcat102
committed
add pre, post process code
1 parent 7bc3e1b commit cf230ab

File tree

6 files changed

+87
-4
lines changed

6 files changed

+87
-4
lines changed

README.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,13 @@
22

33
- The Huggingface's transformers versions for inference under low resource ( no GPU )
44

5+
56
### Highlights
67

78
* Light installation footprint
89

9-
* Blazing fast models ( all models use less than 10M parameters )
10+
* Blazing fast models ( most models use less than 100M parameters )
1011

12+
* Relies on [ONNX](https://onnxruntime.ai/) for weights inference (install size ~ 30MB)
1113

1214

tests/test_preprocessing.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,12 @@ def test_hierarchical():
1414

1515

1616
logits = -np.ones((32, 644))
17-
logits[:3, :27] = np.random.random((3, 27))
17+
logits[:3, :100] = np.random.random((3, 100))
1818
logits[:3, :10] = np.ones((3, 10))
1919
outputs = {
2020
'hierarchical': sigmoid(logits)
2121
}
2222
output = postprocessing.forward(outputs, {})
2323
for zero_output in output[3:]:
24-
assert len(zero_output) == 0
24+
assert len(zero_output) == 0
25+

tinytensor/postprocessing.py

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
import logging
2+
from tinytensor.postprocessing.text_classification import (
3+
HierarchicalMultiClassification,
4+
Classification,
5+
TopkClassification
6+
)
7+
8+
# Registry mapping the `output` type string of a model's output
# configuration to the post-processing class that handles it.
# Looked up by make_postprocessing below.
pipeline2class = {
    'topk_classification': TopkClassification,
    'classification': Classification,
    'multi_hierarchical_classification': HierarchicalMultiClassification
}
13+
14+
def make_postprocessing(main_configuration):
    """Build the ordered list of post-processing steps for a model.

    Parameters
    ----------
    main_configuration : dict
        Model configuration whose ``'outputs'`` entry yields
        ``(index, output_config)`` pairs; each ``output_config['output']``
        names a type registered in ``pipeline2class``.
        NOTE(review): if ``'outputs'`` is a plain dict this iteration only
        sees its keys — confirm it is a sequence of pairs (or switch to
        ``.items()``).

    Returns
    -------
    list
        Instantiated post-processors, ordered by their integer index.

    Raises
    ------
    KeyError
        If an output config names an unregistered ``output`` type.
    """
    # Sort by the numeric index so post-processors run in the order the
    # configuration declares, regardless of iteration order.
    pipeline = sorted(
        (int(idx), output_config)
        for idx, output_config in main_configuration['outputs']
    )

    processor = []
    for _order_id, config in pipeline:
        # Fixed misspelled local (`potsprocess_cls` -> `postprocess_cls`).
        postprocess_cls = pipeline2class[config['output']]
        processor.append(postprocess_cls(config))

    return processor
27+
28+

tinytensor/preprocessing.py

+20
Original file line numberDiff line numberDiff line change
@@ -1 +1,21 @@
11
import logging
2+
from tinytensor.preprocessing.tokenizer import (
3+
TextProcessing
4+
)
5+
6+
# Registry mapping the `type` string of a model's input configuration to
# the pre-processing class that handles it. Looked up by
# make_preprocessing below.
pipeline2class = {
    'text': TextProcessing
}
9+
10+
11+
def make_preprocessing(main_configuration):
    """Instantiate one pre-processing step per configured model input.

    Each entry of ``main_configuration['inputs']`` is a
    ``(param_name, config)`` pair. The parameter name is written into the
    config under ``'name'`` before the class registered for
    ``config['type']`` in ``pipeline2class`` is constructed.

    Returns the list of pre-processor instances, in configuration order.
    """
    steps = []
    for input_name, config in main_configuration['inputs']:
        # Tag the config with the model-input name it feeds, so the
        # processor knows which key to read/write.
        config['name'] = input_name
        steps.append(pipeline2class[config['type']](config))
    return steps
21+

tinytensor/preprocessing/abstract.py

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
2+
3+
class Preprocessing():
    """Base class for input pre-processing steps.

    Subclasses turn raw user inputs into model-ready values by
    implementing :meth:`forward`.
    """

    def forward(self, inputs, model_inputs):
        '''
        Transform raw inputs and record the result in model_inputs.

        inputs: Inputs
        chains: Other results from previous models
        '''
        # Bug fix: the original `raise NotImplemented()` raised TypeError
        # (NotImplemented is a non-callable constant, not an exception);
        # NotImplementedError is the correct abstract-method signal.
        raise NotImplementedError
11+

tinytensor/preprocessing/tokenizer.py

+22-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,26 @@
11
import logging
2+
import numpy as np
23
try:
3-
import tokenizer
4+
from tokenizers import Tokenizer
45
except ImportError:
56
logging.warning("Huggingface tokenizers not installed ")
7+
from tinytensor.preprocessing.abstract import Preprocessing
8+
9+
10+
class TextProcessing(Preprocessing):
    """Pre-processing step that tokenizes raw text into an array of token ids.

    Loads a Hugging Face ``tokenizers.Tokenizer`` from the file named by
    ``configuration['tokenizer_file']`` and reads/writes the input keyed by
    ``configuration['name']``.
    """

    def __init__(self, configuration) -> None:
        # The tokenizer is loaded once at construction time.
        self.tokenizer = Tokenizer.from_file(configuration['tokenizer_file'])
        # Name of the model input this processor consumes and produces.
        self.name = configuration["name"]

    def forward(self, inputs, model_inputs):
        """Tokenize ``inputs[self.name]`` and store the ids in ``model_inputs``.

        Accepts a single string or a list of strings; a single string is
        wrapped into a one-element batch.
        """
        text = inputs[self.name]
        if isinstance(text, str):
            text = [text]

        # Bug fix: the original referenced `self.stu_tokenizer`, which is
        # never assigned (__init__ sets `self.tokenizer`) and therefore
        # raised AttributeError on every call.
        # NOTE(review): np.array assumes all encodings have equal length
        # (tokenizer pads/truncates); otherwise this builds a ragged
        # object array — confirm the tokenizer config enables padding.
        tokens = np.array([encoding.ids
                           for encoding in self.tokenizer.encode_batch(text)])
        model_inputs[self.name] = tokens
        return model_inputs
25+
26+

0 commit comments

Comments
 (0)