Files
file	byte_pair_encoding.hpp

file	subword_tokenize.hpp

file	tokenize.hpp

Classes
struct	nvtext::bpe_merge_pairs
	The table of merge pairs for the BPE encoder. More...

struct	nvtext::hashed_vocabulary
	The vocabulary data for use with the subword_tokenize function. More...

struct	nvtext::tokenizer_result
	Result object for the subword_tokenize functions. More...

struct	nvtext::tokenize_vocabulary
	Vocabulary object to be used with nvtext::tokenize_with_vocabulary. More...

Functions
std::unique_ptr< bpe_merge_pairs >	nvtext::load_merge_pairs_file (std::string const &filename_merges, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
	Create a nvtext::bpe_merge_pairs from an input file. More...

std::unique_ptr< bpe_merge_pairs >	nvtext::load_merge_pairs (cudf::strings_column_view const &merge_pairs, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
	Create a nvtext::bpe_merge_pairs from a strings column. More...

std::unique_ptr< cudf::column >	nvtext::byte_pair_encoding (cudf::strings_column_view const &input, bpe_merge_pairs const &merges_pairs, cudf::string_scalar const &separator=cudf::string_scalar(" "), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
	Byte pair encode the input strings. More...

std::unique_ptr< hashed_vocabulary >	nvtext::load_vocabulary_file (std::string const &filename_hashed_vocabulary, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
	Load the hashed vocabulary file into device memory. More...

tokenizer_result	nvtext::subword_tokenize (cudf::strings_column_view const &strings, hashed_vocabulary const &vocabulary_table, uint32_t max_sequence_length, uint32_t stride, bool do_lower_case, bool do_truncate, rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
	Creates a tokenizer that cleans the text, splits it into tokens and returns token-ids from an input vocabulary. More...

std::unique_ptr< cudf::column >	nvtext::tokenize (cudf::strings_column_view const &input, cudf::string_scalar const &delimiter=cudf::string_scalar{""}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
	Returns a single column of strings by tokenizing the input strings column using the provided characters as delimiters. More...

std::unique_ptr< cudf::column >	nvtext::tokenize (cudf::strings_column_view const &input, cudf::strings_column_view const &delimiters, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
	Returns a single column of strings by tokenizing the input strings column using multiple strings as delimiters. More...

std::unique_ptr< cudf::column >	nvtext::count_tokens (cudf::strings_column_view const &input, cudf::string_scalar const &delimiter=cudf::string_scalar{""}, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
	Returns the number of tokens in each string of a strings column. More...

std::unique_ptr< cudf::column >	nvtext::count_tokens (cudf::strings_column_view const &input, cudf::strings_column_view const &delimiters, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
	Returns the number of tokens in each string of a strings column by using multiple string delimiters to identify tokens in each string. More...

std::unique_ptr< cudf::column >	nvtext::character_tokenize (cudf::strings_column_view const &input, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
	Returns a single column of strings by converting each character to a string. More...

std::unique_ptr< cudf::column >	nvtext::detokenize (cudf::strings_column_view const &input, cudf::column_view const &row_indices, cudf::string_scalar const &separator=cudf::string_scalar(" "), rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
	Creates a strings column from a strings column of tokens and an associated column of row ids. More...

std::unique_ptr< tokenize_vocabulary >	nvtext::load_vocabulary (cudf::strings_column_view const &input, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
	Create a tokenize_vocabulary object from a strings column. More...

std::unique_ptr< cudf::column >	nvtext::tokenize_with_vocabulary (cudf::strings_column_view const &input, tokenize_vocabulary const &vocabulary, cudf::string_scalar const &delimiter, cudf::size_type default_id=-1, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::mr::device_memory_resource *mr=rmm::mr::get_current_device_resource())
	Returns the token ids for the input string by looking up each delimited token in the given vocabulary. More...

Detailed Description

Function Documentation

◆ byte_pair_encoding()

std::unique_ptr<cudf::column> nvtext::byte_pair_encoding	(	cudf::strings_column_view const &	input,
		bpe_merge_pairs const &	merges_pairs,
		cudf::string_scalar const &	separator = `cudf::string_scalar(" ")`,
		rmm::mr::device_memory_resource *	mr = `rmm::mr::get_current_device_resource()`
	)

Byte pair encode the input strings.

This will split each string on whitespace, perform the encoding, and then build the output column using the given separator.

The encoding algorithm rebuilds each string by matching substrings in the merge_pairs table and iteratively removing the minimum ranked pair until no pairs are left. Then, a space is inserted between the remaining pairs before the result is joined to make the output string.

merge_pairs = ["e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"]
mps = load_merge_pairs(merge_pairs)
input = ["test sentence", "thisis test"]
result = byte_pair_encoding(input, mps)
result is now ["test sent ence", "this is test"]

Exceptions

cudf::logic_error	if `merges_pairs` is empty
cudf::logic_error	if `separator` is invalid

Parameters

input	Strings to encode.
merges_pairs	Created by a call to nvtext::load_merge_pairs.
separator	String used to build the output after encoding. Default is a space.
mr	Memory resource to allocate any returned objects.

Returns: An encoded column of strings.

◆ character_tokenize()

std::unique_ptr<cudf::column> nvtext::character_tokenize	(	cudf::strings_column_view const &	input,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::mr::device_memory_resource *	mr = `rmm::mr::get_current_device_resource()`
	)

Returns a single column of strings by converting each character to a string.

Each string is converted to multiple strings – one for each character. Note that a character may be more than one byte.

Example:
s = ["hello world", null, "goodbye"]
t = character_tokenize(s)
t is now ["h","e","l","l","o"," ","w","o","r","l","d","g","o","o","d","b","y","e"]

All null row entries are ignored and the output contains all valid rows.

Parameters

input	Strings column to tokenize
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the returned column's device memory

Returns: New strings column of tokens

◆ count_tokens() [1/2]

std::unique_ptr<cudf::column> nvtext::count_tokens	(	cudf::strings_column_view const &	input,
		cudf::string_scalar const &	delimiter = `cudf::string_scalar{""}`,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::mr::device_memory_resource *	mr = `rmm::mr::get_current_device_resource()`
	)

Returns the number of tokens in each string of a strings column.

The delimiter may be zero or more characters. If the delimiter is empty, whitespace (character code-point <= ' ') is used for identifying tokens. Also, any consecutive delimiters found in a string are ignored. This means that only empty strings, strings consisting entirely of delimiters, or null rows will result in a token count of 0.

Example:
s = ["a", "b c", " ", "d e f"]
t = count_tokens(s)
t is now [1, 2, 0, 3]

All null row entries are ignored and the output contains all valid rows. The number of tokens for a null element is set to 0 in the output column.

Parameters

input	Strings column to count tokens
delimiter	String used to separate each string into tokens. The default of empty string will separate tokens using whitespace.
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the returned column's device memory

Returns: New column of token counts

◆ count_tokens() [2/2]

std::unique_ptr<cudf::column> nvtext::count_tokens	(	cudf::strings_column_view const &	input,
		cudf::strings_column_view const &	delimiters,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::mr::device_memory_resource *	mr = `rmm::mr::get_current_device_resource()`
	)

Returns the number of tokens in each string of a strings column by using multiple string delimiters to identify tokens in each string.

Also, any consecutive delimiters found in a string are ignored. This means that only empty strings, strings consisting entirely of delimiters, or null rows will result in a token count of 0.

Example:
s = ["a", "b c", "d.e:f;"]
d = [".", ":", ";"]
t = count_tokens(s,d)
t is now [1, 1, 3]

All null row entries are ignored and the output contains all valid rows. The number of tokens for a null element is set to 0 in the output column.

Exceptions

cudf::logic_error if the delimiters column is empty or contains nulls

Parameters

input	Strings column to count tokens
delimiters	Strings used to separate each string into tokens
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the returned column's device memory

Returns: New column of token counts

◆ detokenize()

std::unique_ptr<cudf::column> nvtext::detokenize	(	cudf::strings_column_view const &	input,
		cudf::column_view const &	row_indices,
		cudf::string_scalar const &	separator = `cudf::string_scalar(" ")`,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::mr::device_memory_resource *	mr = `rmm::mr::get_current_device_resource()`
	)

Creates a strings column from a strings column of tokens and an associated column of row ids.

Multiple tokens from the input column may be combined into a single row (string) in the output column. The tokens are concatenated along with the separator string in the order in which they appear in the row_indices column.

Example:
s = ["hello", "world", "one", "two", "three"]
r = [0, 0, 1, 1, 1]
s1 = detokenize(s,r)
s1 is now ["hello world", "one two three"]
r = [0, 2, 1, 1, 0]
s2 = detokenize(s,r)
s2 is now ["hello three", "one two", "world"]

All null row entries are ignored and the output contains all valid rows. The values in row_indices are expected to be non-negative, sequential values without any missing row indices; otherwise the output is undefined.

Exceptions

cudf::logic_error	if `separator` is invalid
cudf::logic_error	if `row_indices.size() != input.size()`
cudf::logic_error	if `row_indices` contains nulls

Parameters

input	Strings column to detokenize
row_indices	The relative output row index assigned for each token in the input column
separator	String to append after concatenating each token to the proper output row
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the returned column's device memory

Returns: New strings column of tokens

◆ load_merge_pairs()

std::unique_ptr<bpe_merge_pairs> nvtext::load_merge_pairs	(	cudf::strings_column_view const &	merge_pairs,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::mr::device_memory_resource *	mr = `rmm::mr::get_current_device_resource()`
	)

Create a nvtext::bpe_merge_pairs from a strings column.

The input column should contain a unique pair of strings per line separated by a single space. An incorrect format or non-unique entries will result in undefined behavior.

Example:

merge_pairs = ["e n", "i t", "i s", "e s", "en t", "c e", "es t", "en ce", "t est", "s ent"]
mps = load_merge_pairs(merge_pairs)
// the mps object can be passed to the byte_pair_encoding API

The pairs are expected to be ordered in the column by their rank relative to each other. A pair earlier in the column has priority over any pairs below it.

Exceptions

cudf::logic_error if merge_pairs is empty or contains nulls

Parameters

merge_pairs	Column containing the unique merge pairs
stream	CUDA stream used for device memory operations and kernel launches
mr	Memory resource to allocate any returned objects

Returns: A nvtext::bpe_merge_pairs object

◆ load_merge_pairs_file()

std::unique_ptr<bpe_merge_pairs> nvtext::load_merge_pairs_file	(	std::string const &	filename_merges,
		rmm::mr::device_memory_resource *	mr = `rmm::mr::get_current_device_resource()`
	)

Create a nvtext::bpe_merge_pairs from an input file.

Deprecated: Since 23.12

The file should contain a pair of strings per line separated by a single space.

Example:

e n
i t
i s
e s
en t
c e
es t
en ce
T h
Th is
t est
s ent
...

The pairs are expected to be ordered in the file by their rank relative to each other. A pair earlier in the file has priority over any pairs below it.

Parameters

filename_merges	Local file path of pairs encoded in UTF-8.
mr	Memory resource to allocate any returned objects.

Returns: A nvtext::bpe_merge_pairs object

◆ load_vocabulary()

std::unique_ptr<tokenize_vocabulary> nvtext::load_vocabulary	(	cudf::strings_column_view const &	input,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::mr::device_memory_resource *	mr = `rmm::mr::get_current_device_resource()`
	)

Create a tokenize_vocabulary object from a strings column.

Token ids are the row indices within the vocabulary column. Each vocabulary entry is expected to be unique; otherwise the behavior is undefined.

Exceptions

cudf::logic_error if vocabulary contains nulls or is empty

Parameters

input	Strings for the vocabulary
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the returned column's device memory

Returns: Object to be used with nvtext::tokenize_with_vocabulary

◆ load_vocabulary_file()

std::unique_ptr<hashed_vocabulary> nvtext::load_vocabulary_file	(	std::string const &	filename_hashed_vocabulary,
		rmm::mr::device_memory_resource *	mr = `rmm::mr::get_current_device_resource()`
	)

Load the hashed vocabulary file into device memory.

The object returned here can be used to call subword_tokenize without incurring the cost of loading the same file each time.

Exceptions

cudf::logic_error if the filename_hashed_vocabulary could not be opened.

Parameters

filename_hashed_vocabulary	A path to the preprocessed vocab.txt file. Note that this is the file AFTER python/perfect_hash.py has been used for preprocessing.
mr	Memory resource to allocate any returned objects.

Returns: vocabulary hash-table elements

◆ subword_tokenize()

tokenizer_result nvtext::subword_tokenize	(	cudf::strings_column_view const &	strings,
		hashed_vocabulary const &	vocabulary_table,
		uint32_t	max_sequence_length,
		uint32_t	stride,
		bool	do_lower_case,
		bool	do_truncate,
		rmm::mr::device_memory_resource *	mr = `rmm::mr::get_current_device_resource()`
	)

Creates a tokenizer that cleans the text, splits it into tokens and returns token-ids from an input vocabulary.

The strings are first normalized by converting to lower-case, removing punctuation, replacing a select set of multi-byte characters and whitespace characters.

The strings are then tokenized by using whitespace as a delimiter. Consecutive delimiters are ignored. Each token is then assigned a 4-byte token-id mapped from the provided vocabulary table.

Essentially each string is converted into one or more vectors of token-ids in the output column. The total number of these vectors times max_sequence_length is the size of the tensor_token_ids output column. For do_truncate==true:

size of tensor_token_ids = max_sequence_length * strings.size()
size of tensor_attention_mask = max_sequence_length * strings.size()
size of tensor_metadata = 3 * strings.size()

For do_truncate==false the number of rows per output string depends on the number of tokens resolved and the stride value which may repeat tokens in subsequent overflow rows.

This function requires about 21x the number of character bytes in the input strings column as working memory.

Exceptions

cudf::logic_error	if `stride > max_sequence_length`
std::overflow_error	if `max_sequence_length * max_rows_tensor` exceeds the column size limit

Parameters

strings	The input strings to tokenize.
vocabulary_table	The vocabulary table pre-loaded into this object.
max_sequence_length	Limit of the number of token-ids per row in final tensor for each string.
stride	Each row in the output token-ids will replicate `max_sequence_length - stride` token-ids from the previous row, unless it is the first string.
do_lower_case	If true, the tokenizer will convert uppercase characters in the input stream to lower-case and strip accents from those characters. If false, accented and uppercase characters are not transformed.
do_truncate	If true, the tokenizer will discard all the token-ids after `max_sequence_length` for each input string. If false, it will use a new row in the output token-ids to continue generating the output.
mr	Memory resource to allocate any returned objects.

Returns: token-ids, attention-mask, and metadata

◆ tokenize() [1/2]

std::unique_ptr<cudf::column> nvtext::tokenize	(	cudf::strings_column_view const &	input,
		cudf::string_scalar const &	delimiter = `cudf::string_scalar{""}`,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::mr::device_memory_resource *	mr = `rmm::mr::get_current_device_resource()`
	)

Returns a single column of strings by tokenizing the input strings column using the provided characters as delimiters.

The delimiter may be zero or more characters. If the delimiter is empty, whitespace (character code-point <= ' ') is used for identifying tokens. Also, any consecutive delimiters found in a string are ignored. This means only non-empty tokens are returned.

Tokens are found by locating delimiter(s) starting at the beginning of each string. As each string is tokenized, the tokens are appended using input column row order to build the output column. That is, tokens found in input row[i] will be placed in the output column directly before tokens found in input row[i+1].

Example:
s = ["a", "b c", "d  e f "]
t = tokenize(s)
t is now ["a", "b", "c", "d", "e", "f"]

All null row entries are ignored and the output contains all valid rows.

Parameters

input	Strings column to tokenize
delimiter	UTF-8 characters used to separate each string into tokens. The default of empty string will separate tokens using whitespace.
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the returned column's device memory

Returns: New strings column of tokens

◆ tokenize() [2/2]

std::unique_ptr<cudf::column> nvtext::tokenize	(	cudf::strings_column_view const &	input,
		cudf::strings_column_view const &	delimiters,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::mr::device_memory_resource *	mr = `rmm::mr::get_current_device_resource()`
	)

Returns a single column of strings by tokenizing the input strings column using multiple strings as delimiters.

Tokens are found by locating delimiter(s) starting at the beginning of each string. Any consecutive delimiters found in a string are ignored. This means only non-empty tokens are returned.

As each string is tokenized, the tokens are appended using input column row order to build the output column. That is, tokens found in input row[i] will be placed in the output column directly before tokens found in input row[i+1].

Example:
s = ["a", "b c", "d.e:f;"]
d = [".", ":", ";"]
t = tokenize(s,d)
t is now ["a", "b c", "d", "e", "f"]

All null row entries are ignored and the output contains all valid rows.

Exceptions

cudf::logic_error if the delimiters column is empty or contains nulls.

Parameters

input	Strings column to tokenize
delimiters	Strings used to separate individual strings into tokens
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the returned column's device memory

Returns: New strings column of tokens

◆ tokenize_with_vocabulary()

std::unique_ptr<cudf::column> nvtext::tokenize_with_vocabulary	(	cudf::strings_column_view const &	input,
		tokenize_vocabulary const &	vocabulary,
		cudf::string_scalar const &	delimiter,
		cudf::size_type	default_id = `-1`,
		rmm::cuda_stream_view	stream = `cudf::get_default_stream()`,
		rmm::mr::device_memory_resource *	mr = `rmm::mr::get_current_device_resource()`
	)

Returns the token ids for the input string by looking up each delimited token in the given vocabulary.

Example:
s = ["hello world", "hello there", "there there world", "watch out world"]
v = load_vocabulary(["hello", "there", "world"])
r = tokenize_with_vocabulary(s,v)
r is now [[0,2], [0,1], [1,1,2], [-1,-1,2]]

Any null row entry results in a corresponding null entry in the output.

Exceptions

cudf::logic_error if delimiter is invalid

Parameters

input	Strings column to tokenize
vocabulary	Used to look up the tokens found within `input`
delimiter	Used to identify tokens within `input`
default_id	The token id to be used for tokens not found in the `vocabulary`; Default is -1
stream	CUDA stream used for device memory operations and kernel launches
mr	Device memory resource used to allocate the returned column's device memory

Returns: Lists column of token ids

Files

Classes

Functions

Detailed Description

Function Documentation

◆ byte_pair_encoding()

◆ character_tokenize()

◆ count_tokens() [1/2]

◆ count_tokens() [2/2]

◆ detokenize()

◆ load_merge_pairs()

◆ load_merge_pairs_file()

◆ load_vocabulary()

◆ load_vocabulary_file()

◆ subword_tokenize()

◆ tokenize() [1/2]

◆ tokenize() [2/2]

◆ tokenize_with_vocabulary()