Coverage for mlprodict/sklapi/onnx_tokenizer.py: 93%


111 statements  

# pylint: disable=E1101
"""
@file
@brief Wrappers for tokenizers implemented in :epkg:`onnxruntime-extensions`.
"""
from io import BytesIO
import base64
import numpy
from scipy.sparse import csr_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from onnx import helper, TensorProto, load
from onnx.defs import onnx_opset_version
try:
    from onnxruntime_extensions import get_library_path
except ImportError:
    get_library_path = None
from mlprodict import __max_supported_opset__


class TokenizerTransformerBase(BaseEstimator, TransformerMixin):
    """
    Base class for @see cl SentencePieceTokenizerTransformer and
    @see cl GPT2TokenizerTransformer.
    """

    def __init__(self):
        BaseEstimator.__init__(self)
        TransformerMixin.__init__(self)
        from onnxruntime import InferenceSession, SessionOptions  # delayed
        self._InferenceSession = InferenceSession
        self._SessionOptions = SessionOptions

    def __getstate__(self):
        # The InferenceSession cannot be pickled: keep only the
        # serialized ONNX graph and rebuild the session in __setstate__.
        state = BaseEstimator.__getstate__(self)
        del state['sess_']
        del state['_InferenceSession']
        del state['_SessionOptions']
        state['onnx_'] = state['onnx_'].SerializeToString()
        return state

    def __setstate__(self, state):
        if get_library_path is None:
            raise ImportError(
                "onnxruntime_extensions is not installed.")
        from onnxruntime import InferenceSession, SessionOptions  # delayed
        state['onnx_'] = load(BytesIO(state['onnx_']))
        BaseEstimator.__setstate__(self, state)
        self._InferenceSession = InferenceSession
        self._SessionOptions = SessionOptions
        so = SessionOptions()
        so.register_custom_ops_library(get_library_path())
        self.sess_ = InferenceSession(
            self.onnx_.SerializeToString(), so,
            providers=['CPUExecutionProvider'])
        return self

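The two methods above are what make a fitted tokenizer survive pickling: the unpicklable InferenceSession is dropped and later rebuilt from the serialized ONNX graph. A minimal sketch of the round trip, assuming `tok` is a fitted instance of one of the subclasses defined below:

# Minimal sketch: pickle round trip of a fitted tokenizer transformer.
# Assumes `tok` is a fitted SentencePieceTokenizerTransformer or
# GPT2TokenizerTransformer and X is a numpy array of strings.
import pickle

def roundtrip(tok, X):
    data = pickle.dumps(tok)   # __getstate__ drops sess_, keeps onnx_ bytes
    tok2 = pickle.loads(data)  # __setstate__ rebuilds the InferenceSession
    return tok2.transform(X)   # the restored object is ready to use
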

class SentencePieceTokenizerTransformer(TokenizerTransformerBase):
    """
    Wraps `SentencePieceTokenizer
    <https://github.com/microsoft/onnxruntime-extensions/blob/
    main/docs/custom_text_ops.md#sentencepiecetokenizer>`_
    into a :epkg:`scikit-learn` transformer.

    :param model: The SentencePiece model (serialized proto)
        stored as a string
    :param nbest_size: tensor(int64) A scalar for sampling.
        `nbest_size = {0,1}`: no sampling is performed (default).
        `nbest_size > 1`: samples from the nbest_size results.
        `nbest_size < 0`: assumes that nbest_size is infinite and
        samples from all hypotheses (lattice) using the
        forward-filtering-and-backward-sampling algorithm.
    :param alpha: tensor(float) A scalar for a smoothing parameter,
        the inverse temperature for probability rescaling.
    :param reverse: tensor(bool) Reverses the tokenized sequence.
    :param add_bos: tensor(bool) Adds the beginning of sentence token
        to the result.
    :param add_eos: tensor(bool) Adds the end of sentence token to the
        result. When `reverse=True`, beginning/end of sentence tokens
        are added after reversing.
    :param opset: main opset to use

    Method *fit* produces the following attributes:

    * `onnx_`: onnx graph
    * `sess_`: :epkg:`InferenceSession` used to compute the inference
    """

    def __init__(self, model, nbest_size=1, alpha=0.5, reverse=False,
                 add_bos=False, add_eos=False, opset=None):
        TokenizerTransformerBase.__init__(self)
        if isinstance(model, bytes):
            self.model_b64 = model
        else:
            # b64encode expects a bytes-like object, not a list of ints.
            ints = model.tolist()
            b64 = base64.b64encode(bytes(ints))
            self.model_b64 = b64
        self.nbest_size = nbest_size
        self.alpha = alpha
        self.reverse = reverse
        self.add_bos = add_bos
        self.add_eos = add_eos
        self.opset = opset
        if get_library_path is None:
            raise ImportError(
                "onnxruntime_extensions is not installed.")

    def fit(self, X, y=None, sample_weight=None):
        """
        The model is not trained but this method is still needed to
        set the instance up, ready to transform.

        :param X: array of strings
        :param y: unused
        :param sample_weight: unused
        :return: self
        """
        self.onnx_ = self._create_model(
            self.model_b64, opset=self.opset)
        so = self._SessionOptions()
        so.register_custom_ops_library(get_library_path())
        self.sess_ = self._InferenceSession(self.onnx_.SerializeToString(), so)
        return self

    @staticmethod
    def _create_model(model_b64, domain='ai.onnx.contrib', opset=None):
        nodes = []
        mkv = helper.make_tensor_value_info
        nodes.append(helper.make_node(
            'SentencepieceTokenizer',
            inputs=['inputs', 'nbest_size', 'alpha', 'add_bos', 'add_eos',
                    'reverse'],
            outputs=['out0', 'out1'],
            model=model_b64,
            name='SentencepieceTokenizeOpName',
            domain='ai.onnx.contrib'))
        inputs = [
            mkv('inputs', TensorProto.STRING, [None]),
            mkv('nbest_size', TensorProto.INT64, [None]),
            mkv('alpha', TensorProto.FLOAT, [None]),
            mkv('add_bos', TensorProto.BOOL, [None]),
            mkv('add_eos', TensorProto.BOOL, [None]),
            mkv('reverse', TensorProto.BOOL, [None])]
        graph = helper.make_graph(
            nodes, 'SentencePieceTokenizerTransformer', inputs, [
                mkv('out0', TensorProto.INT32, [None]),
                mkv('out1', TensorProto.INT64, [None])])
        if opset is None:
            opset = min(__max_supported_opset__, onnx_opset_version())
        model = helper.make_model(graph, opset_imports=[
            helper.make_operatorsetid('', opset)])
        model.opset_import.extend([helper.make_operatorsetid(domain, 1)])
        return model

    def transform(self, X):
        """
        Applies the tokenizer on an array of strings.

        :param X: array of strings
        :return: sparse matrix with *n_features* columns
        """
        out0, out1 = self.sess_.run(
            ['out0', 'out1'],
            {'inputs': X, 'nbest_size': self.nbest_size, 'alpha': self.alpha,
             'add_bos': self.add_bos, 'add_eos': self.add_eos,
             'reverse': self.reverse})
        # out0 holds the token ids (column indices), out1 the row
        # boundaries: with a vector of ones they form a CSR matrix.
        values = numpy.ones(out0.shape[0], dtype=numpy.float32)
        return csr_matrix((values, out0, out1))

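To illustrate how the pieces fit together, here is a hedged usage sketch, not part of the module. The file spm.model is a placeholder for a trained SentencePiece model, and since the docstring declares the sampling parameters as tensors, they are passed here as one-element numpy arrays.

# Hedged usage sketch; "spm.model" is a placeholder path to a trained
# SentencePiece model, not a file shipped with mlprodict.
import numpy

with open("spm.model", "rb") as f:
    model_bytes = f.read()

tok = SentencePieceTokenizerTransformer(
    model_bytes,
    nbest_size=numpy.array([1], dtype=numpy.int64),
    alpha=numpy.array([0.5], dtype=numpy.float32),
    reverse=numpy.array([False], dtype=numpy.bool_),
    add_bos=numpy.array([False], dtype=numpy.bool_),
    add_eos=numpy.array([False], dtype=numpy.bool_))
corpus = numpy.array(["a sentence", "another one"])
tok.fit(corpus)                 # only builds the ONNX graph and the session
sparse = tok.transform(corpus)  # CSR matrix, one row per input string
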

class GPT2TokenizerTransformer(TokenizerTransformerBase):
    """
    Wraps `GPT2Tokenizer
    <https://github.com/microsoft/onnxruntime-extensions/blob/
    main/docs/custom_text_ops.md#gpt2tokenizer>`_
    into a :epkg:`scikit-learn` transformer.

    :param vocab: The content of the vocabulary file,
        its format is the same as Hugging Face's.
    :param merges: The content of the merges file,
        its format is the same as Hugging Face's.
    :param padding_length: When the input is a set of queries,
        the tokenized result is a ragged tensor, so it needs to be
        padded into a regular tensor; *padding_length* indicates
        the padding strategy.
        When *padding_length* equals -1, the tensor is padded
        to the length of the longest row.
        When *padding_length* is greater than 0, the tensor is padded
        to exactly *padding_length*.
    :param opset: main opset to use

    Method *fit* produces the following attributes:

    * `onnx_`: onnx graph
    * `sess_`: :epkg:`InferenceSession` used to compute the inference
    """

    def __init__(self, vocab, merges, padding_length=-1, opset=None):
        TokenizerTransformerBase.__init__(self)
        self.vocab = vocab
        self.merges = merges
        self.padding_length = padding_length
        self.opset = opset
        if get_library_path is None:
            raise ImportError(
                "onnxruntime_extensions is not installed.")

    def fit(self, X, y=None, sample_weight=None):
        """
        The model is not trained but this method is still needed to
        set the instance up, ready to transform.

        :param X: array of strings
        :param y: unused
        :param sample_weight: unused
        :return: self
        """
        self.onnx_ = self._create_model(
            self.vocab, self.merges, self.padding_length, opset=self.opset)
        so = self._SessionOptions()
        so.register_custom_ops_library(get_library_path())
        self.sess_ = self._InferenceSession(self.onnx_.SerializeToString(), so)
        return self


    @staticmethod
    def _create_model(vocab, merges, padding_length,
                      domain='ai.onnx.contrib', opset=None):
        nodes = []
        mkv = helper.make_tensor_value_info
        nodes.append(helper.make_node(
            'GPT2Tokenizer',
            inputs=['inputs'],
            outputs=['input_ids', 'attention_mask'],
            vocab=vocab, merges=merges,
            padding_length=padding_length,
            name='GPT2TokenizerName',
            domain='ai.onnx.contrib'))
        inputs = [mkv('inputs', TensorProto.STRING, [None])]
        graph = helper.make_graph(
            nodes, 'GPT2TokenizerTransformer', inputs, [
                mkv('input_ids', TensorProto.INT64, [None, None]),
                mkv('attention_mask', TensorProto.INT64, [None, None])])
        if opset is None:
            opset = min(__max_supported_opset__, onnx_opset_version())
        model = helper.make_model(
            graph, opset_imports=[helper.make_operatorsetid('', opset)])
        model.opset_import.extend([helper.make_operatorsetid(domain, 1)])
        return model

    def transform(self, X):
        """
        Applies the tokenizer on an array of strings.

        :param X: array of strings
        :return: sparse matrix
        """
        input_ids, _ = self.sess_.run(
            ['input_ids', 'attention_mask'], {'inputs': X})
        # input_ids is a padded rectangular matrix: the flattened token
        # ids become the column indices, and since every row has the
        # same width, the CSR row pointers are multiples of that width.
        idx = input_ids.ravel()
        values = numpy.ones(idx.shape[0], dtype=numpy.float32)
        rg = numpy.arange(input_ids.shape[0] + 1).astype(numpy.int64)
        rows = rg * input_ids.shape[1]
        return csr_matrix((values, idx, rows))
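
A similar hedged sketch for the GPT-2 tokenizer; vocab.json and merges.txt are placeholders for Hugging Face GPT-2 vocabulary files and are not shipped with the module.

# Hedged usage sketch; "vocab.json" and "merges.txt" are placeholder
# paths to Hugging Face GPT-2 vocabulary files.
import numpy

with open("vocab.json", "r", encoding="utf-8") as f:
    vocab = f.read()
with open("merges.txt", "r", encoding="utf-8") as f:
    merges = f.read()

tok = GPT2TokenizerTransformer(vocab, merges, padding_length=-1)
corpus = numpy.array(["a sentence", "another one"])
tok.fit(corpus)                 # builds the ONNX graph, no training happens
sparse = tok.transform(corpus)  # CSR matrix, one row per input string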