transformers/src/transformers/tokenization_utils_base.py at v5.4.0 · huggingface/transformers

History

3560 lines (3055 loc) · 170 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

457

458

459

460

461

462

463

464

465

466

467

468

469

470

471

472

473

474

475

476

477

478

479

480

481

482

483

484

485

486

487

488

489

490

491

492

493

494

495

496

497

498

499

500

501

502

503

504

505

506

507

508

509

510

511

512

513

514

515

516

517

518

519

520

521

522

523

524

525

526

527

528

529

530

531

532

533

534

535

536

537

538

539

540

541

542

543

544

545

546

547

548

549

550

551

552

553

554

555

556

557

558

559

560

561

562

563

564

565

566

567

568

569

570

571

572

573

574

575

576

577

578

579

580

581

582

583

584

585

586

587

588

589

590

591

592

593

594

595

596

597

598

599

600

601

602

603

604

605

606

607

608

609

610

611

612

613

614

615

616

617

618

619

620

621

622

623

624

625

626

627

628

629

630

631

632

633

634

635

636

637

638

639

640

641

642

643

644

645

646

647

648

649

650

651

652

653

654

655

656

657

658

659

660

661

662

663

664

665

666

667

668

669

670

671

672

673

674

675

676

677

678

679

680

681

682

683

684

685

686

687

688

689

690

691

692

693

694

695

696

697

698

699

700

701

702

703

704

705

706

707

708

709

710

711

712

713

714

715

716

717

718

719

720

721

722

723

724

725

726

727

728

729

730

731

732

733

734

735

736

737

738

739

740

741

742

743

744

745

746

747

748

749

750

751

752

753

754

755

756

757

758

759

760

761

762

763

764

765

766

767

768

769

770

771

772

773

774

775

776

777

778

779

780

781

782

783

784

785

786

787

788

789

790

791

792

793

794

795

796

797

798

799

800

801

802

803

804

805

806

807

808

809

810

811

812

813

814

815

816

817

818

819

820

821

822

823

824

825

826

827

828

829

830

831

832

833

834

835

836

837

838

839

840

841

842

843

844

845

846

847

848

849

850

851

852

853

854

855

856

857

858

859

860

861

862

863

864

865

866

867

868

869

870

871

872

873

874

875

876

877

878

879

880

881

882

883

884

885

886

887

888

889

890

891

892

893

894

895

896

897

898

899

900

901

902

903

904

905

906

907

908

909

910

911

912

913

914

915

916

917

918

919

920

921

922

923

924

925

926

927

928

929

930

931

932

933

934

935

936

937

938

939

940

941

942

943

944

945

946

947

948

949

950

951

952

953

954

955

956

957

958

959

960

961

962

963

964

965

966

967

968

969

970

971

972

973

974

975

976

977

978

979

980

981

982

983

984

985

986

987

988

989

990

991

992

993

994

995

996

997

998

999

1000

# base

# Licensed under the Apache License, Version 2.0 (the "License");

# you may not use this file except in compliance with the License.

# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software

# distributed under the License is distributed on an "AS IS" BASIS,

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

# See the License for the specific language governing permissions and

# limitations under the License.

"""

Base classes common to both the slow and the fast tokenization classes: PreTrainedTokenizerBase (hosts all the
user-facing encoding methods), the special tokens mixin (hosts the special tokens logic) and BatchEncoding (wraps the
dictionary of outputs with special methods for the fast tokenizers).

"""

from __future__ import annotations

import copy

import json

import os

import re

import warnings

from collections import OrderedDict, UserDict

from collections.abc import Callable, Collection, Mapping, Sequence, Sized

from dataclasses import dataclass

from pathlib import Path

from typing import TYPE_CHECKING, Any, NamedTuple, Union

import numpy as np

from huggingface_hub import create_repo, is_offline_mode, list_repo_files

from packaging import version

from . import __version__

from .dynamic_module_utils import custom_object_save

from .utils import (

CHAT_TEMPLATE_DIR,

CHAT_TEMPLATE_FILE,

ExplicitEnum,

PaddingStrategy,

PushToHubMixin,

TensorType,

add_end_docstrings,

cached_file,

copy_func,

extract_commit_hash,

is_mlx_available,

is_numpy_array,

is_protobuf_available,

is_tokenizers_available,

is_torch_available,

is_torch_device,

is_torch_tensor,

list_repo_templates,

logging,

requires_backends,

to_py_obj,

)

from .utils.chat_parsing_utils import recursive_parse

from .utils.chat_template_utils import render_jinja_template

from .utils.import_utils import PROTOBUF_IMPORT_ERROR

if TYPE_CHECKING:

if is_torch_available():

import torch

def import_protobuf_decode_error(error_message=""):
    """
    Return protobuf's `DecodeError` class, importing it lazily.

    Args:
        error_message (`str`, *optional*, defaults to `""`):
            Text substituted into `PROTOBUF_IMPORT_ERROR` when protobuf is not available.

    Returns:
        The `google.protobuf.message.DecodeError` exception class.

    Raises:
        ImportError: If `is_protobuf_available()` reports that protobuf is missing.
    """
    # Fail first so that the import below only runs when protobuf is known to be present.
    if not is_protobuf_available():
        raise ImportError(PROTOBUF_IMPORT_ERROR.format(error_message))

    from google.protobuf.message import DecodeError

    return DecodeError

def flatten(arr: list):
    """
    Recursively flatten nested lists/tuples into a single flat list.

    Every element is inspected on its own: lists and tuples are expanded (at any depth), anything else (scalars,
    strings, arrays, ...) is kept as a leaf. Mixed inputs such as `[1, [2, 3]]` or `[[1], 2]` are therefore
    handled, instead of depending on the type of the first element only.

    Args:
        arr (`list`): The (possibly nested) sequence to flatten.

    Returns:
        `list`: The leaves of `arr` in depth-first order. An empty input yields an empty list.
    """
    res = []
    for item in arr:
        # Decide per element; looking at `arr[0]` would mis-handle heterogeneous nesting.
        if isinstance(item, (list, tuple)):
            res.extend(flatten(item))
        else:
            res.append(item)
    return res

if is_tokenizers_available() or TYPE_CHECKING:

from tokenizers import Encoding as EncodingFast

if is_tokenizers_available():

from tokenizers import AddedToken

else:

@dataclass(frozen=False, eq=True)
class AddedToken:
    """
    AddedToken represents a token to be added to a Tokenizer. An AddedToken can have special options defining the
    way it should behave.

    The `normalized` will default to `not special` if it is not specified, similarly to the definition in
    `tokenizers`.

    The options are declared as dataclass fields so that the generated `__eq__` and `__repr__` take them into
    account (two tokens are equal only when all their options are equal).

    Args:
        content (`str`): The text of the token.
        single_word (`bool`, *optional*, defaults to `False`): Stored as given.
        lstrip (`bool`, *optional*, defaults to `False`): Stored as given.
        rstrip (`bool`, *optional*, defaults to `False`): Stored as given.
        special (`bool`, *optional*, defaults to `False`): Stored as given; also drives the default of `normalized`.
        normalized (`bool`, *optional*): Defaults to `not special` when left to `None`.
    """

    content: str
    single_word: bool = False
    lstrip: bool = False
    rstrip: bool = False
    special: bool = False
    normalized: bool | None = None

    def __post_init__(self):
        # Resolve the "unspecified" sentinel once, at construction time.
        if self.normalized is None:
            self.normalized = not self.special

    def __getstate__(self):
        return self.__dict__

    def __str__(self):
        return self.content

# Module-level logger obtained from the project's `logging` utility.
logger = logging.get_logger(__name__)

# NOTE: `int(1e30)` converts a float literal, so the resulting integer is only approximately 10**30.
VERY_LARGE_INTEGER = int(1e30)  # This is used to set the max input length for a model with infinite size input
LARGE_INTEGER = int(1e20)  # This is used when we need something big but slightly smaller than VERY_LARGE_INTEGER

# Define type aliases and NamedTuples
# Single inputs: raw text, a list of pre-split strings, or a list of integer ids.
TextInput = str
PreTokenizedInput = list[str]
EncodedInput = list[int]
# Pair inputs: two-element tuples of the corresponding single-input types.
TextInputPair = tuple[str, str]
PreTokenizedInputPair = tuple[list[str], list[str]]
EncodedInputPair = tuple[list[int], list[int]]

# Define type aliases for text-related non-text modalities
# "torch.Tensor" is written as a string so that torch does not have to be imported at runtime.
AudioInput = Union[np.ndarray, "torch.Tensor", list[np.ndarray], list["torch.Tensor"]]

# Slow tokenizers used to be saved in three separated files
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
ADDED_TOKENS_FILE = "added_tokens.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"

# Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file
FULL_TOKENIZER_FILE = "tokenizer.json"
# Matches names of the form `tokenizer.<something>.json` and captures the `<something>` part.
_re_tokenizer_file = re.compile(r"tokenizer\.(.*)\.json")

class TruncationStrategy(ExplicitEnum):
    """
    Possible values for the `truncation` argument in [`PreTrainedTokenizerBase.__call__`]. Useful for tab-completion in
    an IDE.

    Each member's value is the plain string spelling of that option.
    """

    ONLY_FIRST = "only_first"
    ONLY_SECOND = "only_second"
    LONGEST_FIRST = "longest_first"
    DO_NOT_TRUNCATE = "do_not_truncate"

class CharSpan(NamedTuple):
    """
    Character span in the original string.

    The span is half-open: `start` is included, `end` is the position right after the last character.

    Args:
        start (`int`): Index of the first character in the original string.
        end (`int`): Index of the character following the last character in the original string.
    """

    start: int
    end: int

class TokenSpan(NamedTuple):
    """
    Token span in an encoded string (list of tokens).

    The span is half-open: `start` is included, `end` is the position right after the last token.

    Args:
        start (`int`): Index of the first token in the span.
        end (`int`): Index of the token following the last token in the span.
    """

    start: int
    end: int

class BatchEncoding(UserDict):

"""

Holds the output of the [`~tokenization_utils_base.PreTrainedTokenizerBase.__call__`],

[`~tokenization_utils_base.PreTrainedTokenizerBase.encode_plus`] and

[`~tokenization_utils_base.PreTrainedTokenizerBase.batch_encode_plus`] methods (tokens, attention_masks, etc).

This class is derived from a python dictionary and can be used as a dictionary. In addition, this class exposes

utility methods to map from word/character space to token space.

Args:

data (`dict`, *optional*):

Dictionary of lists/arrays/tensors returned by the `__call__`/`encode_plus`/`batch_encode_plus` methods

('input_ids', 'attention_mask', etc.).

encoding (`tokenizers.Encoding` or `Sequence[tokenizers.Encoding]`, *optional*):

If the tokenizer is a fast tokenizer which outputs additional information like mapping from word/character

space to token space the `tokenizers.Encoding` instance or list of instance (for batches) hold this

information.

tensor_type (`Union[None, str, TensorType]`, *optional*):

You can give a tensor_type here to convert the lists of integers in PyTorch/Numpy Tensors at

initialization.

prepend_batch_axis (`bool`, *optional*, defaults to `False`):

Whether or not to add a batch axis when converting to tensors (see `tensor_type` above). Note that this

parameter has an effect if the parameter `tensor_type` is set, *otherwise has no effect*.

n_sequences (`Optional[int]`, *optional*):

You can give a tensor_type here to convert the lists of integers in PyTorch/Numpy Tensors at

initialization.

"""

def __init__(

self,

data: dict[str, Any] | None = None,

encoding: EncodingFast | Sequence[EncodingFast] | None = None,

tensor_type: None | str | TensorType = None,

prepend_batch_axis: bool = False,

n_sequences: int | None = None,

super().__init__(data)

# If encoding is not None, the fast tokenization is used

if encoding is not None and isinstance(encoding, EncodingFast):

encoding = [encoding]

self._encodings = encoding

if n_sequences is None and encoding is not None and encoding:

n_sequences = encoding[0].n_sequences

self._n_sequences = n_sequences

self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)

@property

def n_sequences(self) -> int | None:

"""

`Optional[int]`: The number of sequences used to generate each sample from the batch encoded in this

[`BatchEncoding`]. Currently can be one of `None` (unknown), `1` (a single sentence) or `2` (a pair of

sentences)

"""

return self._n_sequences

def __getitem__(self, item: int | str) -> Any | EncodingFast:

"""

If the key is a string, returns the value of the dict associated to `key` ('input_ids', 'attention_mask',

etc.).

If the key is an integer, get the `tokenizers.Encoding` for batch item with index `key`.

If the key is a slice, returns the value of the dict associated to `key` ('input_ids', 'attention_mask', etc.)

with the constraint of slice.

"""

if isinstance(item, str):

return self.data[item]

elif self._encodings is not None:

return self._encodings[item]

elif isinstance(item, slice):

return {key: self.data[key][item] for key in self.data}

else:

raise KeyError(

"Invalid key. Only three types of key are available: "

"(1) string, (2) integers for backend Encoding, and (3) slices for data subsetting."

)

def __getattr__(self, item: str):

try:

return self.data[item]

except KeyError:

raise AttributeError

def __getstate__(self):

return {"data": self.data, "encodings": self._encodings}

def __setstate__(self, state):

if "data" in state:

self.data = state["data"]

if "encodings" in state:

self._encodings = state["encodings"]

# After this point:

# Extended properties and methods only available for fast (Rust-based) tokenizers

# provided by HuggingFace tokenizers library.

@property

def is_fast(self) -> bool:

"""

TOOD: ita i will rm this `bool`: Whether or not this BatchEncoding was created by a fast tokenizer.

"""

return self._encodings is not None

@property

def encodings(self) -> list[EncodingFast] | None:

"""

`Optional[list[tokenizers.Encoding]]`: The list all encodings from the tokenization process. Returns `None` if

the input was tokenized through Python (i.e., not a fast) tokenizer.

"""

return self._encodings

def tokens(self, batch_index: int = 0) -> list[str]:

"""

Return the list of tokens (sub-parts of the input strings after word/subword splitting and before conversion to

integer indices) at a given batch index (only works for the output of a fast tokenizer).

Args:

batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.

Returns:

`list[str]`: The list of tokens at that index.

"""

if not self._encodings:

raise ValueError(

"tokens() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast`"

" class)."

)

return self._encodings[batch_index].tokens

def sequence_ids(self, batch_index: int = 0) -> list[int | None]:

"""

Return a list mapping the tokens to the id of their original sentences:

- `None` for special tokens added around or between sequences,

- `0` for tokens corresponding to words in the first sequence,

- `1` for tokens corresponding to words in the second sequence when a pair of sequences was jointly

encoded.

Args:

batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.

Returns:

`list[Optional[int]]`: A list indicating the sequence id corresponding to each token. Special tokens added

by the tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding

sequence.

"""

if not self._encodings:

raise ValueError(

"sequence_ids() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast`"

" class)."

)

return self._encodings[batch_index].sequence_ids

def word_ids(self, batch_index: int = 0) -> list[int | None]:

"""

Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.

Args:

batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.

Returns:

`list[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by the

tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding word

(several tokens will be mapped to the same word index if they are parts of that word).

"""

if not self._encodings:

raise ValueError(

"word_ids() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast`"

" class)."

)

return self._encodings[batch_index].word_ids

def token_to_sequence(self, batch_or_token_index: int, token_index: int | None = None) -> int:

"""

Get the index of the sequence represented by the given token. In the general use case, this method returns `0`

for a single sequence or the first sequence of a pair, and `1` for the second sequence of a pair

Can be called as:

- `self.token_to_sequence(token_index)` if batch size is 1

- `self.token_to_sequence(batch_index, token_index)` if batch size is greater than 1

This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.,

words are defined by the user). In this case it allows to easily associate encoded tokens with provided

tokenized words.

Args:

batch_or_token_index (`int`):

Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of

the token in the sequence.

token_index (`int`, *optional*):

If a batch index is provided in *batch_or_token_index*, this can be the index of the token in the

sequence.

Returns:

`int`: Index of the word in the input sequence.

"""

if not self._encodings:

raise ValueError("token_to_sequence() is not available when using Python based tokenizers")

if token_index is not None:

batch_index = batch_or_token_index

else:

batch_index = 0

token_index = batch_or_token_index

if batch_index < 0:

batch_index = self._batch_size + batch_index

if token_index < 0:

token_index = self._seq_len + token_index

return self._encodings[batch_index].token_to_sequence(token_index)

def token_to_word(self, batch_or_token_index: int, token_index: int | None = None) -> int:

"""

Get the index of the word corresponding (i.e. comprising) to an encoded token in a sequence of the batch.

Can be called as:

- `self.token_to_word(token_index)` if batch size is 1

- `self.token_to_word(batch_index, token_index)` if batch size is greater than 1

This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.,

words are defined by the user). In this case it allows to easily associate encoded tokens with provided

tokenized words.

Args:

batch_or_token_index (`int`):

Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of

the token in the sequence.

token_index (`int`, *optional*):

If a batch index is provided in *batch_or_token_index*, this can be the index of the token in the

sequence.

Returns:

`int`: Index of the word in the input sequence.

"""

if not self._encodings:

raise ValueError("token_to_word() is not available when using Python based tokenizers")

if token_index is not None:

batch_index = batch_or_token_index

else:

batch_index = 0

token_index = batch_or_token_index

if batch_index < 0:

batch_index = self._batch_size + batch_index

if token_index < 0:

token_index = self._seq_len + token_index

return self._encodings[batch_index].token_to_word(token_index)

def word_to_tokens(

self, batch_or_word_index: int, word_index: int | None = None, sequence_index: int = 0

) -> TokenSpan | None:

"""

Get the encoded token span corresponding to a word in a sequence of the batch.

Token spans are returned as a [`~tokenization_utils_base.TokenSpan`] with:

- **start** -- Index of the first token.

- **end** -- Index of the token following the last token.

Can be called as:

- `self.word_to_tokens(word_index, sequence_index: int = 0)` if batch size is 1

- `self.word_to_tokens(batch_index, word_index, sequence_index: int = 0)` if batch size is greater or equal to

This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words

are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized

words.

Args:

batch_or_word_index (`int`):

Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of

the word in the sequence.

word_index (`int`, *optional*):

If a batch index is provided in *batch_or_token_index*, this can be the index of the word in the

sequence.

sequence_index (`int`, *optional*, defaults to 0):

If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0

or 1) the provided word index belongs to.

Returns:

([`~tokenization_utils_base.TokenSpan`], *optional*): Span of tokens in the encoded sequence. Returns

`None` if no tokens correspond to the word. This can happen especially when the token is a special token

that has been used to format the tokenization. For example when we add a class token at the very beginning

of the tokenization.

"""

if not self._encodings:

raise ValueError("word_to_tokens() is not available when using Python based tokenizers")

if word_index is not None:

batch_index = batch_or_word_index

else:

batch_index = 0

word_index = batch_or_word_index

if batch_index < 0:

batch_index = self._batch_size + batch_index

if word_index < 0:

word_index = self._seq_len + word_index

span = self._encodings[batch_index].word_to_tokens(word_index, sequence_index)

return TokenSpan(*span) if span is not None else None

def token_to_chars(self, batch_or_token_index: int, token_index: int | None = None) -> CharSpan | None:

"""

Get the character span corresponding to an encoded token in a sequence of the batch.

Character spans are returned as a [`~tokenization_utils_base.CharSpan`] with:

- **start** -- Index of the first character in the original string associated to the token.

- **end** -- Index of the character following the last character in the original string associated to the

token.

Can be called as:

- `self.token_to_chars(token_index)` if batch size is 1

- `self.token_to_chars(batch_index, token_index)` if batch size is greater or equal to 1

Args:

batch_or_token_index (`int`):

Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of

the token in the sequence.

token_index (`int`, *optional*):

If a batch index is provided in *batch_or_token_index*, this can be the index of the token or tokens in

the sequence.

Returns:

[`~tokenization_utils_base.CharSpan`]: Span of characters in the original string, or None, if the token

(e.g. <s>, </s>) doesn't correspond to any chars in the origin string.

"""

if not self._encodings:

raise ValueError("token_to_chars() is not available when using Python based tokenizers")

if token_index is not None:

batch_index = batch_or_token_index

else:

batch_index = 0

token_index = batch_or_token_index

span_indices = self._encodings[batch_index].token_to_chars(token_index)

return CharSpan(*span_indices) if span_indices is not None else None

def char_to_token(self, batch_or_char_index: int, char_index: int | None = None, sequence_index: int = 0) -> int:

"""

Get the index of the token in the encoded output comprising a character in the original string for a sequence

of the batch.

Can be called as:

- `self.char_to_token(char_index)` if batch size is 1

- `self.char_to_token(batch_index, char_index)` if batch size is greater or equal to 1

This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words

are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized

words.

Args:

batch_or_char_index (`int`):

Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of

the word in the sequence

char_index (`int`, *optional*):

If a batch index is provided in *batch_or_token_index*, this can be the index of the word in the

sequence.

sequence_index (`int`, *optional*, defaults to 0):

If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0

or 1) the provided character index belongs to.

Returns:

`int`: Index of the token, or None if the char index refers to a whitespace only token and whitespace is

trimmed with `trim_offsets=True`.

"""

if not self._encodings:

raise ValueError("char_to_token() is not available when using Python based tokenizers")

if char_index is not None:

batch_index = batch_or_char_index

else:

batch_index = 0

char_index = batch_or_char_index

return self._encodings[batch_index].char_to_token(char_index, sequence_index)

def word_to_chars(

self, batch_or_word_index: int, word_index: int | None = None, sequence_index: int = 0

) -> CharSpan:

"""

Get the character span in the original string corresponding to given word in a sequence of the batch.

Character spans are returned as a CharSpan NamedTuple with:

- start: index of the first character in the original string

- end: index of the character following the last character in the original string

Can be called as:

- `self.word_to_chars(word_index)` if batch size is 1

- `self.word_to_chars(batch_index, word_index)` if batch size is greater or equal to 1

Args:

batch_or_word_index (`int`):

Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of

the word in the sequence

word_index (`int`, *optional*):

If a batch index is provided in *batch_or_token_index*, this can be the index of the word in the

sequence.

sequence_index (`int`, *optional*, defaults to 0):

If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0

or 1) the provided word index belongs to.

Returns:

`CharSpan` or `list[CharSpan]`: Span(s) of the associated character or characters in the string. CharSpan

are NamedTuple with:

- start: index of the first character associated to the token in the original string

- end: index of the character following the last character associated to the token in the original

string

"""

if not self._encodings:

raise ValueError("word_to_chars() is not available when using Python based tokenizers")

if word_index is not None:

batch_index = batch_or_word_index

else:

batch_index = 0

word_index = batch_or_word_index

return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index, sequence_index)))

def char_to_word(self, batch_or_char_index: int, char_index: int | None = None, sequence_index: int = 0) -> int:

"""

Get the word in the original string corresponding to a character in the original string of a sequence of the

batch.

Can be called as:

- `self.char_to_word(char_index)` if batch size is 1

- `self.char_to_word(batch_index, char_index)` if batch size is greater than 1

This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words

are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized

words.

Args:

batch_or_char_index (`int`):

Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of

the character in the original string.

char_index (`int`, *optional*):

If a batch index is provided in *batch_or_token_index*, this can be the index of the character in the

original string.

sequence_index (`int`, *optional*, defaults to 0):

If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0

or 1) the provided character index belongs to.

Returns:

`int` or `list[int]`: Index or indices of the associated encoded token(s).

"""

if not self._encodings:

raise ValueError("char_to_word() is not available when using Python based tokenizers")

if char_index is not None:

batch_index = batch_or_char_index

else:

batch_index = 0

char_index = batch_or_char_index

return self._encodings[batch_index].char_to_word(char_index, sequence_index)

def convert_to_tensors(self, tensor_type: str | TensorType | None = None, prepend_batch_axis: bool = False):

"""

Convert the inner content to tensors.

Args:

tensor_type (`str` or [`~utils.TensorType`], *optional*):

The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If

`None`, no modification is done.

prepend_batch_axis (`int`, *optional*, defaults to `False`):

Whether or not to add the batch dimension during the conversion.

"""

if tensor_type is None:

return self

# Convert to TensorType

if not isinstance(tensor_type, TensorType):

tensor_type = TensorType(tensor_type)

if tensor_type == TensorType.PYTORCH:

if not is_torch_available():

raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.")

import torch

def as_tensor(value, dtype=None):

if isinstance(value, list) and len(value) > 0 and isinstance(value[0], np.ndarray):

return torch.from_numpy(np.array(value))

if len(flatten(value)) == 0 and dtype is None:

dtype = torch.int64

return torch.tensor(value, dtype=dtype)

is_tensor = torch.is_tensor

elif tensor_type == TensorType.MLX:

if not is_mlx_available():

raise ImportError("Unable to convert output to MLX tensors format, MLX is not installed.")

import mlx.core as mx

def as_tensor(value, dtype=None):

if len(flatten(value)) == 0 and dtype is None:

dtype = mx.int32

return mx.array(value, dtype=dtype)

def is_tensor(obj):

return isinstance(obj, mx.array)

else:

def as_tensor(value, dtype=None):

if (

isinstance(value, (list, tuple))

and len(value) > 0

and isinstance(value[0], (list, tuple, np.ndarray))

value_lens = [len(val) for val in value]

if len(set(value_lens)) > 1 and dtype is None:

# we have a ragged list so handle explicitly

value = as_tensor([np.asarray(val) for val in value], dtype=object)

if len(flatten(value)) == 0 and dtype is None:

dtype = np.int64

return np.asarray(value, dtype=dtype)

is_tensor = is_numpy_array

# Do the tensor conversion in batch

for key, value in self.items():

try:

if prepend_batch_axis:

value = [value]

if not is_tensor(value):

tensor = as_tensor(value)

# Removing this for now in favor of controlling the shape with `prepend_batch_axis`

# # at-least2d

# if tensor.ndim > 2:

# tensor = tensor.squeeze(0)

# elif tensor.ndim < 2:

# tensor = tensor[None, :]

self[key] = tensor

except Exception as e:

if key == "overflowing_tokens":

raise ValueError(

"Unable to create tensor returning overflowing tokens of different lengths. "

"Please see if a fast version of this tokenizer is available to have this feature available."

) from e

raise ValueError(

"Unable to create tensor, you should probably activate truncation and/or padding with"

" 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your"

f" features (`{key}` in this case) have excessive nesting (inputs type `list` where type `int` is"

" expected)."

) from e

return self

def to(self, device: str | torch.device, *, non_blocking: bool = False) -> BatchEncoding:
    """
    Move every stored value that exposes a callable `to` onto `device` (PyTorch only).

    Args:
        device (`str` or `torch.device`): The device to put the tensors on. An `int` is accepted as well.
        non_blocking (`bool`): Forwarded as `non_blocking` to each value's `to` call.

    Returns:
        [`BatchEncoding`]: The same instance after modification.
    """
    requires_backends(self, ["torch"])

    # Guard against callers such as APEX that blindly invoke `to` on every module input:
    # forwarding such a cast would turn the LongTensor holding the token idxs into a HalfTensor.
    is_device_like = isinstance(device, str) or is_torch_device(device) or isinstance(device, int)
    if not is_device_like:
        logger.warning(f"Attempting to cast a BatchEncoding to type {str(device)}. This is not supported.")
        return self

    def _relocate(item):
        # Values without a callable `to` attribute are kept exactly as they are.
        if hasattr(item, "to") and callable(item.to):
            return item.to(device=device, non_blocking=non_blocking)
        return item

    self.data = {name: _relocate(item) for name, item in self.data.items()}
    return self

ENCODE_KWARGS_DOCSTRING = r"""

add_special_tokens (`bool`, *optional*, defaults to `True`):

Whether or not to add special tokens when encoding the sequences. This will use the underlying

`PretrainedTokenizerBase.build_inputs_with_special_tokens` function, which defines which tokens are

automatically added to the input ids. This is useful if you want to add `bos` or `eos` tokens

automatically.

padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):

Activates and controls padding. Accepts the following values:

- `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single

sequence is provided).

- `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum

acceptable input length for the model if that argument is not provided.

- `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different

lengths).

truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):

Activates and controls truncation. Accepts the following values:

- `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or

to the maximum acceptable input length for the model if that argument is not provided. This will

truncate token by token, removing a token from the longest sequence in the pair if a pair of

sequences (or a batch of pairs) is provided.

- `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the

maximum acceptable input length for the model if that argument is not provided. This will only

truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.

- `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the

maximum acceptable input length for the model if that argument is not provided. This will only

truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.

- `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths

greater than the model maximum admissible input size).

max_length (`int`, *optional*):

Controls the maximum length to use by one of the truncation/padding parameters.

If left unset or set to `None`, this will use the predefined model maximum length if a maximum length

is required by one of the truncation/padding parameters. If the model has no specific maximum input

length (like XLNet) truncation/padding to a maximum length will be deactivated.

stride (`int`, *optional*, defaults to 0):

If set to a number along with `max_length`, the overflowing tokens returned when

`return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence

returned to provide some overlap between truncated and overflowing sequences. The value of this

argument defines the number of overlapping tokens.

is_split_into_words (`bool`, *optional*, defaults to `False`):

Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the

tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)

which it will tokenize. This is useful for NER or token classification.

pad_to_multiple_of (`int`, *optional*):

If set will pad the sequence to a multiple of the provided value. Requires `padding` to be activated.

This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability

`>= 7.0` (Volta).

padding_side (`str`, *optional*):

The side on which the model should have padding applied. Should be selected between ['right', 'left'].

Default value is picked from the class attribute of the same name.

return_tensors (`str` or [`~utils.TensorType`], *optional*):

If set, will return tensors instead of list of python integers. Acceptable values are:

- `'pt'`: Return PyTorch `torch.Tensor` objects.

- `'np'`: Return Numpy `np.ndarray` objects.

"""

ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""

return_token_type_ids (`bool`, *optional*):

Whether to return token type IDs. If left to the default, will return the token type IDs according to

the specific tokenizer's default, defined by the `model_input_names` attribute.

[What are token type IDs?](../glossary#token-type-ids)

return_attention_mask (`bool`, *optional*):

Whether to return the attention mask. If left to the default, will return the attention mask according

to the specific tokenizer's default, defined by the `model_input_names` attribute.

[What are attention masks?](../glossary#attention-mask)

return_overflowing_tokens (`bool`, *optional*, defaults to `False`):

Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch

of pairs) is provided with `truncation_strategy = longest_first` or `True`, an error is raised instead

of returning overflowing tokens.

return_special_tokens_mask (`bool`, *optional*, defaults to `False`):

Whether or not to return special tokens mask information.

return_offsets_mapping (`bool`, *optional*, defaults to `False`):

Whether or not to return `(char_start, char_end)` for each token.

This is only available on fast tokenizers inheriting from [`PreTrainedTokenizerFast`], if using

Python's tokenizer, this method will raise `NotImplementedError`.

return_length (`bool`, *optional*, defaults to `False`):

Whether or not to return the lengths of the encoded inputs.

verbose (`bool`, *optional*, defaults to `True`):

Whether or not to print more information and warnings.

**kwargs: passed to the `self.tokenize()` method

Return:

[`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

- **input_ids** -- List of token ids to be fed to a model.

[What are input IDs?](../glossary#input-ids)

- **token_type_ids** -- List of token type ids to be fed to a model (when `return_token_type_ids=True` or

if *"token_type_ids"* is in `self.model_input_names`).

[What are token type IDs?](../glossary#token-type-ids)

- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when

`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names`).

[What are attention masks?](../glossary#attention-mask)

- **overflowing_tokens** -- List of overflowing tokens sequences (when a `max_length` is specified and

`return_overflowing_tokens=True`).

- **num_truncated_tokens** -- Number of tokens truncated (when a `max_length` is specified and

`return_overflowing_tokens=True`).

- **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying

regular sequence tokens (when `add_special_tokens=True` and `return_special_tokens_mask=True`).

- **length** -- The length of the inputs (when `return_length=True`)

"""

INIT_TOKENIZER_DOCSTRING = r"""

Class attributes (overridden by derived classes)

- **vocab_files_names** (`dict[str, str]`) -- A dictionary with, as keys, the `__init__` keyword name of each

vocabulary file required by the model, and as associated values, the filename for saving the associated file

(string).

- **pretrained_vocab_files_map** (`dict[str, dict[str, str]]`) -- A dictionary of dictionaries, with the

high-level keys being the `__init__` keyword name of each vocabulary file required by the model, the

low-level being the `short-cut-names` of the pretrained models with, as associated values, the `url` to the

associated pretrained vocabulary file.

- **model_input_names** (`list[str]`) -- A list of inputs expected in the forward pass of the model.

- **padding_side** (`str`) -- The default value for the side on which the model should have padding applied.

Should be `'right'` or `'left'`.

- **truncation_side** (`str`) -- The default value for the side on which the model should have truncation

applied. Should be `'right'` or `'left'`.

Args:

model_max_length (`int`, *optional*):

The maximum length (in number of tokens) for the inputs to the transformer model. When the tokenizer is

loaded with [`~tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`], this will be set to the

value stored for the associated model in `max_model_input_sizes` (see above). If no value is provided, will

default to VERY_LARGE_INTEGER (`int(1e30)`).

padding_side (`str`, *optional*):

The side on which the model should have padding applied. Should be selected between ['right', 'left'].

Default value is picked from the class attribute of the same name.

truncation_side (`str`, *optional*):

The side on which the model should have truncation applied. Should be selected between ['right', 'left'].

Default value is picked from the class attribute of the same name.

chat_template (`str`, *optional*):

A Jinja template string that will be used to format lists of chat messages. See

https://huggingface.co/docs/transformers/chat_templating for a full description.

model_input_names (`list[str]`, *optional*):

The list of inputs accepted by the forward pass of the model (like `"token_type_ids"` or

`"attention_mask"`). Default value is picked from the class attribute of the same name.

bos_token (`str` or `tokenizers.AddedToken`, *optional*):

A special token representing the beginning of a sentence.

eos_token (`str` or `tokenizers.AddedToken`, *optional*):

A special token representing the end of a sentence.

unk_token (`str` or `tokenizers.AddedToken`, *optional*):

A special token representing an out-of-vocabulary token.

sep_token (`str` or `tokenizers.AddedToken`, *optional*):

A special token separating two different sentences in the same input (used by BERT for instance).

pad_token (`str` or `tokenizers.AddedToken`, *optional*):

A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by

attention mechanisms or loss computation.

cls_token (`str` or `tokenizers.AddedToken`, *optional*):

A special token representing the class of the input (used by BERT for instance).

mask_token (`str` or `tokenizers.AddedToken`, *optional*):

A special token representing a masked token (used by masked-language modeling pretraining objectives, like

BERT). Will be associated to `self.mask_token` and `self.mask_token_id`.

extra_special_tokens (list of `str` or `tokenizers.AddedToken`, *optional*):

A list of extra model-specific special tokens. Add them here to ensure they are skipped when decoding with

`skip_special_tokens` set to `True`. If they are not part of the vocabulary, they will be added at the end

of the vocabulary.

split_special_tokens (`bool`, *optional*, defaults to `False`):

Whether or not the special tokens should be split during the tokenization process. Passing this argument will

affect the internal state of the tokenizer. The default behavior is to not split special tokens. This means that if

`<s>` is the `bos_token`, then `tokenizer.tokenize("<s>") = ['<s>']`. Otherwise, if

`split_special_tokens=True`, then `tokenizer.tokenize("<s>")` will give `['<', 's', '>']`.

"""

@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)

class PreTrainedTokenizerBase(PushToHubMixin):

"""

Base class for all tokenizer backends.

"""

vocab_files_names: dict[str, str] = {}

pretrained_vocab_files_map: dict[str, dict[str, str]] = {}

_auto_class: str | None = None

# first name has to correspond to main model input name

# to make sure `tokenizer.pad(...)` works correctly

model_input_names: list[str] = ["input_ids", "attention_mask"]

padding_side: str = "right"

truncation_side: str = "right"

slow_tokenizer_class = None

# Special tokens support (moved from SpecialTokensMixin)

# V5: Clean separation of named special tokens from extra special tokens

SPECIAL_TOKENS_ATTRIBUTES = [

"bos_token",

"eos_token",

"unk_token",

"sep_token",

"pad_token",

"cls_token",

"mask_token",

]

def __init__(self, **kwargs):

self.init_inputs = ()

for key in kwargs:

if hasattr(self, key) and callable(getattr(self, key)):

raise AttributeError(f"{key} conflicts with the method {key} in {self.__class__.__name__}")

# V5: Convert deprecated additional_special_tokens to extra_special_tokens before storing init_kwargs

if "additional_special_tokens" in kwargs and "extra_special_tokens" not in kwargs:

kwargs["extra_special_tokens"] = kwargs.pop("additional_special_tokens")

self.init_kwargs = copy.deepcopy(kwargs)

self.name_or_path = kwargs.pop("name_or_path", "")

self._processor_class = kwargs.pop("processor_class", None)

View remainder of file in raw view

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

tokenization_utils_base.py

Latest commit

History

tokenization_utils_base.py

File metadata and controls