Skip to content

Commit cf230ab

Browse files
author
theblackcat102
committed
add pre, post process code
1 parent 7bc3e1b commit cf230ab

File tree

6 files changed

+87
-4
lines changed

6 files changed

+87
-4
lines changed

README.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,13 @@
22

33
- The Huggingface's transformers versions for inference under low resource ( no GPU )
44

5+
56
### Highlights
67

78
* Light installation footprint
89

9-
* Blazing fast models ( all models use less than 10M parameters )
10+
* Blazing fast models ( most models use less than 100M parameters )
1011

12+
* Relies on [ONNX](https://onnxruntime.ai/) for weights inference (install size ~ 30MB)
1113

1214

tests/test_preprocessing.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,12 @@ def test_hierarchical():
1414

1515

1616
logits = -np.ones((32, 644))
17-
logits[:3, :27] = np.random.random((3, 27))
17+
logits[:3, :100] = np.random.random((3, 100))
1818
logits[:3, :10] = np.ones((3, 10))
1919
outputs = {
2020
'hierarchical': sigmoid(logits)
2121
}
2222
output = postprocessing.forward(outputs, {})
2323
for zero_output in output[3:]:
24-
assert len(zero_output) == 0
24+
assert len(zero_output) == 0
25+

tinytensor/postprocessing.py

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
import logging
2+
from tinytensor.postprocessing.text_classification import (
3+
HierarchicalMultiClassification,
4+
Classification,
5+
TopkClassification
6+
)
7+
8+
# Registry mapping the `output` type string of a model's output
# configuration to the post-processing class that handles it.
# Looked up by make_postprocessing below.
pipeline2class = {
    'topk_classification': TopkClassification,
    'classification': Classification,
    'multi_hierarchical_classification': HierarchicalMultiClassification
}
13+
14+
def make_postprocessing(main_configuration):
    """Build the ordered list of post-processing steps for a model.

    Parameters
    ----------
    main_configuration : dict
        Model configuration whose ``'outputs'`` entry yields
        ``(index, output_config)`` pairs; each ``output_config['output']``
        names a type registered in ``pipeline2class``.
        NOTE(review): if ``'outputs'`` is a plain dict this iteration only
        sees its keys — confirm it is a sequence of pairs (or switch to
        ``.items()``).

    Returns
    -------
    list
        Instantiated post-processors, ordered by their integer index.

    Raises
    ------
    KeyError
        If an output config names an unregistered ``output`` type.
    """
    # Sort by the numeric index so post-processors run in the order the
    # configuration declares, regardless of iteration order.
    pipeline = sorted(
        (int(idx), output_config)
        for idx, output_config in main_configuration['outputs']
    )

    processor = []
    for _order_id, config in pipeline:
        # Fixed misspelled local (`potsprocess_cls` -> `postprocess_cls`).
        postprocess_cls = pipeline2class[config['output']]
        processor.append(postprocess_cls(config))

    return processor
27+
28+

tinytensor/preprocessing.py

+20
Original file line numberDiff line numberDiff line change
@@ -1 +1,21 @@
11
import logging
2+
from tinytensor.preprocessing.tokenizer import (
3+
TextProcessing
4+
)
5+
6+
# Registry mapping the `type` string of a model's input configuration to
# the pre-processing class that handles it. Looked up by
# make_preprocessing below.
pipeline2class = {
    'text': TextProcessing
}
9+
10+
11+
def make_preprocessing(main_configuration):
    """Instantiate one pre-processing step per configured model input.

    Each entry of ``main_configuration['inputs']`` is a
    ``(param_name, config)`` pair. The parameter name is written into the
    config under ``'name'`` before the class registered for
    ``config['type']`` in ``pipeline2class`` is constructed.

    Returns the list of pre-processor instances, in configuration order.
    """
    steps = []
    for input_name, config in main_configuration['inputs']:
        # Tag the config with the model-input name it feeds, so the
        # processor knows which key to read/write.
        config['name'] = input_name
        steps.append(pipeline2class[config['type']](config))
    return steps
21+

tinytensor/preprocessing/abstract.py

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
2+
3+
class Preprocessing():
    """Base class for input pre-processing steps.

    Subclasses turn raw user inputs into model-ready values by
    implementing :meth:`forward`.
    """

    def forward(self, inputs, model_inputs):
        '''
        Transform raw inputs and record the result in model_inputs.

        inputs: Inputs
        chains: Other results from previous models
        '''
        # Bug fix: the original `raise NotImplemented()` raised TypeError
        # (NotImplemented is a non-callable constant, not an exception);
        # NotImplementedError is the correct abstract-method signal.
        raise NotImplementedError
11+

tinytensor/preprocessing/tokenizer.py

+22-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,26 @@
11
import logging
2+
import numpy as np
23
try:
3-
import tokenizer
4+
from tokenizers import Tokenizer
45
except ImportError:
56
logging.warning("Huggingface tokenizers not installed ")
7+
from tinytensor.preprocessing.abstract import Preprocessing
8+
9+
10+
class TextProcessing(Preprocessing):
    """Pre-processing step that tokenizes raw text into an array of token ids.

    Loads a Hugging Face ``tokenizers.Tokenizer`` from the file named by
    ``configuration['tokenizer_file']`` and reads/writes the input keyed by
    ``configuration['name']``.
    """

    def __init__(self, configuration) -> None:
        # The tokenizer is loaded once at construction time.
        self.tokenizer = Tokenizer.from_file(configuration['tokenizer_file'])
        # Name of the model input this processor consumes and produces.
        self.name = configuration["name"]

    def forward(self, inputs, model_inputs):
        """Tokenize ``inputs[self.name]`` and store the ids in ``model_inputs``.

        Accepts a single string or a list of strings; a single string is
        wrapped into a one-element batch.
        """
        text = inputs[self.name]
        if isinstance(text, str):
            text = [text]

        # Bug fix: the original referenced `self.stu_tokenizer`, which is
        # never assigned (__init__ sets `self.tokenizer`) and therefore
        # raised AttributeError on every call.
        # NOTE(review): np.array assumes all encodings have equal length
        # (tokenizer pads/truncates); otherwise this builds a ragged
        # object array — confirm the tokenizer config enables padding.
        tokens = np.array([encoding.ids
                           for encoding in self.tokenizer.encode_batch(text)])
        model_inputs[self.name] = tokens
        return model_inputs
25+
26+

0 commit comments

Comments
 (0)