StringNormalizer#

StringNormalizer - 10#

Version

  • name: StringNormalizer (GitHub)

  • domain: main

  • since_version: 10

  • function:

  • support_level: SupportType.COMMON

  • shape inference: True

This version of the operator has been available since version 10.

Summary

Attributes

  • case_change_action - STRING : String enum that causes the output to be lowercased, uppercased, or left unchanged. Valid values are “LOWER”, “UPPER”, “NONE”. Default is “NONE”.

  • is_case_sensitive - INT : Boolean. Whether the identification of stop words in X is case-sensitive. Default is false.

  • locale - STRING : Environment-dependent string that denotes the locale according to which output strings need to be upper/lowercased. Default is en_US or a platform-specific equivalent, as decided by the implementation.

  • stopwords - STRINGS : List of stop words. If not set, no word would be removed from X.

Inputs

  • X (heterogeneous) - tensor(string):

Outputs

  • Y (heterogeneous) - tensor(string):

Examples

_nostopwords_nochangecase

import numpy as np
import onnx

# No stop words are supplied and case_change_action is left unset, so the
# operator is a no-op: the expected output is the very same array as the input.
x = np.array(["monday", "tuesday"], dtype=object)
y = x

node = onnx.helper.make_node(
    "StringNormalizer",
    inputs=["x"],
    outputs=["y"],
    is_case_sensitive=1,
)
expect(
    node,
    inputs=[x],
    outputs=[y],
    name="test_strnormalizer_nostopwords_nochangecase",
)

_monday_casesensintive_nochangecase

import numpy as np
import onnx

# Case-sensitive removal of the stop word "monday"; no case change is applied
# to the remaining elements.
stop_list = ["monday"]
node = onnx.helper.make_node(
    "StringNormalizer",
    inputs=["x"],
    outputs=["y"],
    is_case_sensitive=1,
    stopwords=stop_list,
)

x = np.array(["monday", "tuesday", "wednesday", "thursday"], dtype=object)
# "monday" is dropped; everything else passes through unchanged.
y = np.array(["tuesday", "wednesday", "thursday"], dtype=object)

expect(
    node,
    inputs=[x],
    outputs=[y],
    name="test_strnormalizer_export_monday_casesensintive_nochangecase",
)

_monday_casesensintive_lower

import numpy as np
import onnx

# Case-sensitive removal of the stop word "monday", with the surviving
# elements lowercased (case_change_action="LOWER").
stop_list = ["monday"]
node = onnx.helper.make_node(
    "StringNormalizer",
    inputs=["x"],
    outputs=["y"],
    case_change_action="LOWER",
    is_case_sensitive=1,
    stopwords=stop_list,
)

x = np.array(["monday", "tuesday", "wednesday", "thursday"], dtype=object)
# The input is already lowercase, so only the removal of "monday" is visible.
y = np.array(["tuesday", "wednesday", "thursday"], dtype=object)

expect(
    node,
    inputs=[x],
    outputs=[y],
    name="test_strnormalizer_export_monday_casesensintive_lower",
)

_monday_casesensintive_upper

import numpy as np
import onnx

# Case-sensitive removal of the stop word "monday", with the surviving
# elements uppercased (case_change_action="UPPER").
stop_list = ["monday"]
node = onnx.helper.make_node(
    "StringNormalizer",
    inputs=["x"],
    outputs=["y"],
    case_change_action="UPPER",
    is_case_sensitive=1,
    stopwords=stop_list,
)

x = np.array(["monday", "tuesday", "wednesday", "thursday"], dtype=object)
# "monday" is dropped and the remaining three elements come back uppercased.
y = np.array(["TUESDAY", "WEDNESDAY", "THURSDAY"], dtype=object)

expect(
    node,
    inputs=[x],
    outputs=[y],
    name="test_strnormalizer_export_monday_casesensintive_upper",
)

_monday_empty_output

import numpy as np
import onnx

# Every input element equals the stop word, so nothing survives the filter.
# The expected output is a one-element tensor holding an empty string.
stop_list = ["monday"]
node = onnx.helper.make_node(
    "StringNormalizer",
    inputs=["x"],
    outputs=["y"],
    case_change_action="UPPER",
    is_case_sensitive=1,
    stopwords=stop_list,
)

x = np.array(["monday", "monday"], dtype=object)
y = np.array([""], dtype=object)

expect(
    node,
    inputs=[x],
    outputs=[y],
    name="test_strnormalizer_export_monday_empty_output",
)

_monday_insensintive_upper_twodim

import numpy as np
import onnx

# Two-dimensional input of shape [1, 6]. is_case_sensitive is not set on the
# node, so the capitalised "Monday" entries are matched by the lowercase stop
# word "monday" and removed.
x = np.array(
    ["Monday", "tuesday", "wednesday", "Monday", "tuesday", "wednesday"],
    dtype=object,
).reshape([1, 6])

# The four surviving elements are uppercased and keep the leading axis,
# giving shape [1, 4].
# NOTE(review): the previous comment here said upper-casing handles cedilla,
# accented E and German umlaut but fails with German eszett; none of those
# characters appear in this input, so that remark is not exercised here.
y = np.array(
    ["TUESDAY", "WEDNESDAY", "TUESDAY", "WEDNESDAY"],
    dtype=object,
).reshape([1, 4])

stop_list = ["monday"]
node = onnx.helper.make_node(
    "StringNormalizer",
    inputs=["x"],
    outputs=["y"],
    case_change_action="UPPER",
    stopwords=stop_list,
)
expect(
    node,
    inputs=[x],
    outputs=[y],
    name="test_strnormalizer_export_monday_insensintive_upper_twodim",
)