Coverage for mlprodict/sklapi/onnx_tokenizer.py: 93%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# pylint: disable=E1101
2"""
3@file
@brief Wrapper tokenizers implemented in :epkg:`onnxruntime-extensions`.
5"""
6from io import BytesIO
7import base64
8import numpy
9from scipy.sparse import csr_matrix
10from sklearn.base import BaseEstimator, TransformerMixin
11from onnx import helper, TensorProto, load
12from onnx.defs import onnx_opset_version
13try:
14 from onnxruntime_extensions import get_library_path
15except ImportError:
16 get_library_path = None
17from mlprodict import __max_supported_opset__
class TokenizerTransformerBase(BaseEstimator, TransformerMixin):
    """
    Base class for @see cl SentencePieceTokenizerTransformer and
    @see cl GPT2TokenizerTransformer.
    """

    def __init__(self):
        BaseEstimator.__init__(self)
        TransformerMixin.__init__(self)
        from onnxruntime import InferenceSession, SessionOptions  # delayed
        self._InferenceSession = InferenceSession
        self._SessionOptions = SessionOptions

    def __getstate__(self):
        """
        Builds a picklable state: the :epkg:`InferenceSession` and the
        onnxruntime classes cannot be pickled, the ONNX graph is
        serialized as bytes instead.

        Safe to call on an unfitted instance: attributes created by
        *fit* (`sess_`, `onnx_`) may be absent.
        """
        state = BaseEstimator.__getstate__(self)
        # pop with default: these attributes only exist after fit
        # (or after __init__ for the onnxruntime classes).
        state.pop('sess_', None)
        state.pop('_InferenceSession', None)
        state.pop('_SessionOptions', None)
        if 'onnx_' in state:
            state['onnx_'] = state['onnx_'].SerializeToString()
        return state

    def __setstate__(self, state):
        """
        Restores the state, deserializes the ONNX graph and
        recreates the :epkg:`InferenceSession` when the instance
        was fitted before pickling.

        :raises ImportError: if :epkg:`onnxruntime-extensions`
            is not installed
        """
        if get_library_path is None:
            raise ImportError(
                "onnxruntime_extensions is not installed.")
        from onnxruntime import InferenceSession, SessionOptions  # delayed
        fitted = 'onnx_' in state
        if fitted:
            state['onnx_'] = load(BytesIO(state['onnx_']))
        BaseEstimator.__setstate__(self, state)
        self._InferenceSession = InferenceSession
        self._SessionOptions = SessionOptions
        if fitted:
            so = SessionOptions()
            # the custom tokenizer operators live in the extensions library
            so.register_custom_ops_library(get_library_path())
            self.sess_ = InferenceSession(
                self.onnx_.SerializeToString(), so,
                providers=['CPUExecutionProvider'])
        return self
class SentencePieceTokenizerTransformer(TokenizerTransformerBase):
    """
    Wraps `SentencePieceTokenizer
    <https://github.com/microsoft/onnxruntime-extensions/blob/
    main/docs/custom_text_ops.md#sentencepiecetokenizer>`_
    into a :epkg:`scikit-learn` transformer.

    :param model: The sentencepiece model serialized proto as
        stored as a string
    :param nbest_size: tensor(int64) A scalar for sampling.
        `nbest_size = {0,1}`: no sampling is performed.
        (default) `nbest_size > 1`: samples from the nbest_size results.
        `nbest_size < 0`: assuming that nbest_size is infinite and
        samples from the all hypothesis (lattice) using
        forward-filtering-and-backward-sampling algorithm.
    :param alpha: tensor(float) A scalar for a smoothing parameter.
        Inverse temperature for probability rescaling.
    :param reverse: tensor(bool) Reverses the tokenized sequence.
    :param add_bos: tensor(bool) Add beginning of sentence token to the result.
    :param add_eos: tensor(bool) Add end of sentence token to the result
        When reverse=True beginning/end of sentence tokens are added
        after reversing
    :param opset: main opset to use

    Method *fit* produces the following attributes:

    * `onnx_`: onnx graph
    * `sess_`: :epkg:`InferenceSession` used to compute the inference
    """

    def __init__(self, model, nbest_size=1, alpha=0.5, reverse=False,
                 add_bos=False, add_eos=False, opset=None):
        TokenizerTransformerBase.__init__(self)
        if isinstance(model, bytes):
            self.model_b64 = model
        else:
            # assumes *model* is an array of uint8 values
            # (e.g. numpy.fromfile(..., dtype=numpy.uint8)) - TODO confirm.
            # b64encode requires a bytes-like object, a plain list of
            # ints raises TypeError.
            # NOTE(review): this branch stores a base64 string while the
            # bytes branch stores raw bytes - verify the expected format
            # of the 'model' attribute against callers.
            ints = model.tolist()
            b64 = base64.b64encode(bytes(ints))
            self.model_b64 = b64
        self.nbest_size = nbest_size
        self.alpha = alpha
        self.reverse = reverse
        self.add_bos = add_bos
        self.add_eos = add_eos
        self.opset = opset
        if get_library_path is None:
            raise ImportError(
                "onnxruntime_extensions is not installed.")

    def fit(self, X, y=None, sample_weight=None):
        """
        The model is not trained but this method is still needed to
        set the instance up and ready to transform.

        :param X: array of strings
        :param y: unused
        :param sample_weight: unused
        :return: self
        """
        self.onnx_ = self._create_model(
            self.model_b64, opset=self.opset)
        so = self._SessionOptions()
        so.register_custom_ops_library(get_library_path())
        self.sess_ = self._InferenceSession(self.onnx_.SerializeToString(), so)
        return self

    @staticmethod
    def _create_model(model_b64, domain='ai.onnx.contrib', opset=None):
        """
        Builds the ONNX graph hosting a single *SentencepieceTokenizer*
        node.

        :param model_b64: serialized sentencepiece model
        :param domain: domain of the custom operator
        :param opset: main opset, defaults to the most recent
            supported one
        :return: ONNX model
        """
        nodes = []
        mkv = helper.make_tensor_value_info
        nodes.append(helper.make_node(
            'SentencepieceTokenizer',
            inputs=['inputs', 'nbest_size', 'alpha', 'add_bos', 'add_eos',
                    'reverse'],
            outputs=['out0', 'out1'],
            model=model_b64,
            name='SentencepieceTokenizeOpName',
            # use the parameter instead of a hard-coded duplicate so the
            # node and the opset import below always agree
            domain=domain))
        inputs = [
            mkv('inputs', TensorProto.STRING, [None]),
            mkv('nbest_size', TensorProto.INT64, [None]),
            mkv('alpha', TensorProto.FLOAT, [None]),
            mkv('add_bos', TensorProto.BOOL, [None]),
            mkv('add_eos', TensorProto.BOOL, [None]),
            mkv('reverse', TensorProto.BOOL, [None])]
        graph = helper.make_graph(
            nodes, 'SentencePieceTokenizerTransformer', inputs, [
                mkv('out0', TensorProto.INT32, [None]),
                mkv('out1', TensorProto.INT64, [None])])
        if opset is None:
            opset = min(__max_supported_opset__, onnx_opset_version())
        model = helper.make_model(graph, opset_imports=[
            helper.make_operatorsetid('', opset)])
        model.opset_import.extend([helper.make_operatorsetid(domain, 1)])
        return model

    def transform(self, X):
        """
        Applies the tokenizers on an array of strings.

        :param X: array of strings
        :return: sparse matrix with n_features
        """
        out0, out1 = self.sess_.run(['out0', 'out1'],
            {'inputs': X, 'nbest_size': self.nbest_size, 'alpha': self.alpha,
             'add_bos': self.add_bos, 'add_eos': self.add_eos,
             'reverse': self.reverse})
        # (data, indices, indptr) CSR form: out0 holds the token ids
        # (column indices), out1 the row boundaries.
        values = numpy.ones(out0.shape[0], dtype=numpy.float32)
        return csr_matrix((values, out0, out1))
class GPT2TokenizerTransformer(TokenizerTransformerBase):
    """
    Wraps `GPT2Tokenizer
    <https://github.com/microsoft/onnxruntime-extensions/blob/
    main/docs/custom_text_ops.md#gpt2tokenizer>`_
    into a :epkg:`scikit-learn` transformer.

    :param vocab: The content of the vocabulary file,
        its format is same with hugging face.
    :param merges: The content of the merges file,
        its format is same with hugging face.
    :param padding_length: When the input is a set of query,
        the tokenized result is ragged tensor, so we need to pad
        the tensor to tidy tensor and the *padding_length* indicates
        the strategy of the padding.
        When the *padding_length* equals -1, we will pad the tensor
        to length of longest row.
        When the *padding_length* is more than 0, we will pad the tensor
        to the number of padding_length.
    :param opset: main opset to use

    Method *fit* produces the following attributes:

    * `onnx_`: onnx graph
    * `sess_`: :epkg:`InferenceSession` used to compute the inference
    """

    def __init__(self, vocab, merges, padding_length=-1, opset=None):
        TokenizerTransformerBase.__init__(self)
        self.vocab = vocab
        self.merges = merges
        self.padding_length = padding_length
        self.opset = opset
        if get_library_path is None:
            raise ImportError(
                "onnxruntime_extensions is not installed.")

    def fit(self, X, y=None, sample_weight=None):
        """
        The model is not trained but this method is still needed to
        set the instance up and ready to transform.

        :param X: array of strings
        :param y: unused
        :param sample_weight: unused
        :return: self
        """
        self.onnx_ = self._create_model(
            self.vocab, self.merges, self.padding_length, opset=self.opset)
        so = self._SessionOptions()
        so.register_custom_ops_library(get_library_path())
        self.sess_ = self._InferenceSession(self.onnx_.SerializeToString(), so)
        return self

    @staticmethod
    def _create_model(vocab, merges, padding_length,
                      domain='ai.onnx.contrib', opset=None):
        """
        Builds the ONNX graph hosting a single *GPT2Tokenizer* node.

        :param vocab: vocabulary file content
        :param merges: merges file content
        :param padding_length: padding strategy (see class docstring)
        :param domain: domain of the custom operator
        :param opset: main opset, defaults to the most recent
            supported one
        :return: ONNX model
        """
        nodes = []
        mkv = helper.make_tensor_value_info
        nodes.append(helper.make_node(
            'GPT2Tokenizer',
            inputs=['inputs'],
            outputs=['input_ids', 'attention_mask'],
            vocab=vocab, merges=merges,
            padding_length=padding_length,
            name='GPT2TokenizerName',
            # use the parameter instead of a hard-coded duplicate so the
            # node and the opset import below always agree
            domain=domain))
        inputs = [mkv('inputs', TensorProto.STRING, [None])]
        graph = helper.make_graph(
            nodes, 'GPT2TokenizerTransformer', inputs, [
                mkv('input_ids', TensorProto.INT64, [None, None]),
                mkv('attention_mask', TensorProto.INT64, [None, None])])
        if opset is None:
            opset = min(__max_supported_opset__, onnx_opset_version())
        model = helper.make_model(
            graph, opset_imports=[helper.make_operatorsetid('', opset)])
        model.opset_import.extend([helper.make_operatorsetid(domain, 1)])
        return model

    def transform(self, X):
        """
        Applies the tokenizers on an array of strings.

        :param X: array of strings
        :return: sparse matrix with n_features
        """
        input_ids, _ = self.sess_.run(
            ['input_ids', 'attention_mask'], {'inputs': X})
        # build a (data, indices, indptr) CSR matrix: token ids are the
        # column indices; rows all have the same (padded) length so the
        # row pointer is a simple multiple of the row width.
        idx = input_ids.ravel()
        values = numpy.ones(idx.shape[0], dtype=numpy.float32)
        rg = numpy.arange(input_ids.shape[0] + 1).astype(numpy.int64)
        rows = rg * input_ids.shape[1]
        return csr_matrix((values, idx, rows))