Add :skip_utf8_validation flag to encode #61

Open · wants to merge 1 commit into main
4 changes: 4 additions & 0 deletions lib/tokenizers/tokenizer.ex
@@ -452,6 +452,10 @@ defmodule Tokenizers.Tokenizer do
    * `:add_special_tokens` - whether to add special tokens to the
      sequence. Defaults to `true`

    * `:skip_utf8_validation` - whether to skip UTF-8 validation of
      the input. Defaults to `false`. Skipping validation and passing
      invalid strings may lead to errors (including a segmentation
      fault)

    * `:encoding_transformations` - a list of `t:Tokenizers.Encoding.Transformation.t/0`
      to apply to the encoding. Check `Tokenizers.Encoding.transform/2`
      for more information. Defaults to `[]`
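A minimal usage sketch of the new option from Elixir, following the docs above; the pretrained model name is only an example:

{:ok, tokenizer} = Tokenizers.Tokenizer.from_pretrained("bert-base-cased")

# Default path: the input binary is validated as UTF-8 before encoding.
{:ok, encoding} = Tokenizers.Tokenizer.encode(tokenizer, "This is a test")

# Skip validation only when the input is already known to be valid UTF-8;
# as documented above, invalid bytes may segfault.
{:ok, encoding} =
  Tokenizers.Tokenizer.encode(tokenizer, "This is a test", skip_utf8_validation: true)
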
99 changes: 66 additions & 33 deletions native/ex_tokenizers/src/tokenizer.rs
@@ -2,7 +2,7 @@ use std::collections::HashMap;
use std::ops::Deref;
use std::panic;

-use rustler::{NifTaggedEnum, Term};
+use rustler::{Binary, NifTaggedEnum, Term};

use tokenizers::models::wordpiece::WordPieceTrainerBuilder;
use tokenizers::models::TrainerWrapper;
@@ -425,36 +425,79 @@ fn term_to_encode_input<'a, 'b>(term: &'a Term<'b>) -> Result<EncodeInput<'b>, E
}
}

/// Like `term_to_encode_input`, but without UTF-8 validation.
///
/// Safety: the caller must guarantee the binaries contain valid UTF-8;
/// `from_utf8_unchecked` on invalid bytes is undefined behavior and may
/// crash the VM (e.g. a segmentation fault).
fn unsafe_term_to_encode_input<'a, 'b>(
    term: &'a Term<'b>,
) -> Result<EncodeInput<'b>, ExTokenizersError> {
    if let Ok(bin) = term.decode::<Binary>() {
        // Single sequence: reinterpret the binary as &str without checking.
        let slice: &'b [u8] = bin.as_slice();
        let string = unsafe { std::str::from_utf8_unchecked(slice) };
        Ok(EncodeInput::Single(string.into()))
    } else if let Ok((bin1, bin2)) = term.decode::<(Binary, Binary)>() {
        // Pair of sequences, same unchecked conversion for both halves.
        let slice1: &'b [u8] = bin1.as_slice();
        let string1 = unsafe { std::str::from_utf8_unchecked(slice1) };
        let slice2: &'b [u8] = bin2.as_slice();
        let string2 = unsafe { std::str::from_utf8_unchecked(slice2) };
        Ok(EncodeInput::Dual(string1.into(), string2.into()))
    } else {
        Err(ExTokenizersError::Other(String::from(
            "input must be either a string (valid UTF-8 encoded) or a tuple",
        )))
    }
}

#[derive(NifTaggedEnum)]
pub enum EncodeOption {
    SkipUTF8Validation(bool),
    AddSpecialTokens(bool),
    EncodingTransformations(Vec<TransformationElement>),
}

struct EncodeConfig {
    skip_utf8_validation: bool,
    add_special_tokens: bool,
    encoding_transformations: Vec<TransformationElement>,
}

impl From<Vec<EncodeOption>> for EncodeConfig {
    fn from(options: Vec<EncodeOption>) -> Self {
        let mut config = EncodeConfig {
            skip_utf8_validation: false,
            add_special_tokens: true,
            encoding_transformations: Vec::new(),
        };

        for option in options {
            match option {
                EncodeOption::SkipUTF8Validation(val) => {
                    config.skip_utf8_validation = val;
                }
                EncodeOption::AddSpecialTokens(val) => {
                    config.add_special_tokens = val;
                }
                EncodeOption::EncodingTransformations(transformations) => {
                    config.encoding_transformations = transformations;
                }
            }
        }

        config
    }
}
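
Because `EncodeOption` derives `NifTaggedEnum`, the keyword list passed from Elixir decodes straight into these variants; a keyword list is just sugar for a list of tagged tuples, so the two terms below are identical (values illustrative):

options = [skip_utf8_validation: true, add_special_tokens: false]
# literally the same term as:
options = [{:skip_utf8_validation, true}, {:add_special_tokens, false}]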

#[rustler::nif(schedule = "DirtyCpu")]
pub fn tokenizer_encode(
    tokenizer: ExTokenizersTokenizer,
    input: Term,
    options: Vec<EncodeOption>,
) -> Result<ExTokenizersEncoding, ExTokenizersError> {
-    struct Opts {
-        add_special_tokens: bool,
-        encoding_transformations: Vec<TransformationElement>,
-    }
-    let mut opts = Opts {
-        add_special_tokens: true,
-        encoding_transformations: Vec::new(),
-    };
-    options.into_iter().for_each(|option| match option {
-        EncodeOption::AddSpecialTokens(add_special_tokens) => {
-            opts.add_special_tokens = add_special_tokens
-        }
-        EncodeOption::EncodingTransformations(encoding_transformations) => {
-            opts.encoding_transformations = encoding_transformations
-        }
-    });
-
-    let input = term_to_encode_input(&input)?;
+    let opts: EncodeConfig = options.into();
+
+    let input = if opts.skip_utf8_validation {
+        unsafe_term_to_encode_input(&input)?
+    } else {
+        term_to_encode_input(&input)?
+    };
    let mut encoding = tokenizer
        .resource
        .0
@@ -470,25 +513,15 @@ pub fn tokenizer_encode_batch(
    options: Vec<EncodeOption>,
    // add_special_tokens: bool,
) -> Result<Vec<ExTokenizersEncoding>, ExTokenizersError> {
-    struct Opts {
-        add_special_tokens: bool,
-        encoding_transformations: Vec<TransformationElement>,
-    }
-    let mut opts = Opts {
-        add_special_tokens: true,
-        encoding_transformations: Vec::new(),
-    };
-    options.into_iter().for_each(|option| match option {
-        EncodeOption::AddSpecialTokens(add_special_tokens) => {
-            opts.add_special_tokens = add_special_tokens
-        }
-        EncodeOption::EncodingTransformations(encoding_transformations) => {
-            opts.encoding_transformations = encoding_transformations
-        }
-    });
+    let opts: EncodeConfig = options.into();
+    let callback = if opts.skip_utf8_validation {
+        unsafe_term_to_encode_input
+    } else {
+        term_to_encode_input
+    };
    let inputs = inputs
        .iter()
-        .map(term_to_encode_input)
+        .map(callback)
        .collect::<Result<Vec<EncodeInput>, ExTokenizersError>>()?;
    let mut encodings = tokenizer
        .resource
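From the Elixir side, the two paths behave as below; a sketch assuming `tokenizer` is already loaded (the batch contents are illustrative):

# With validation (the default), an invalid binary is rejected cleanly,
# as the new test below asserts:
{:error, _reason} = Tokenizers.Tokenizer.encode(tokenizer, <<0xFF>>)

# The flag is honored for batches too; the conversion function is chosen
# once and applied to every input:
{:ok, encodings} =
  Tokenizers.Tokenizer.encode_batch(tokenizer, ["first text", "second text"],
    skip_utf8_validation: true
  )
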
7 changes: 7 additions & 0 deletions test/tokenizers/tokenizer_test.exs
@@ -142,6 +142,13 @@ defmodule Tokenizers.TokenizerTest do
describe "encode/decode" do
test "can encode a single string", %{tokenizer: tokenizer} do
assert {:ok, %Tokenizers.Encoding{}} = Tokenizer.encode(tokenizer, "This is a test")

assert {:ok, %Tokenizers.Encoding{}} =
Tokenizer.encode(tokenizer, "This is a test", skip_utf8_validation: true)
end

test "errors when encoding a binary", %{tokenizer: tokenizer} do
assert {:error, _} = Tokenizer.encode(tokenizer, <<0xFF>>)
end

test "can apply transformations to encoding", %{tokenizer: tokenizer} do