@@ -1,7 +1,9 @@
 from __future__ import annotations
 
+import os
 import dataclasses
 from typing import Any, Dict, Iterator, List, Optional, Tuple, Union, Protocol
+
 from . import llama_types
 from . import llama
 
@@ -327,6 +329,26 @@ def get_chat_format(name: str):
         )
 
 
+def hf_autotokenizer_to_chat_formatter(pretrained_model_name_or_path: Union[str, os.PathLike[str]]) -> ChatFormatter:
+    # https://huggingface.co/docs/transformers/main/chat_templating
+    # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1#instruction-format
+    # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/blob/main/tokenizer_config.json
+    from transformers import AutoTokenizer
+
+    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
+
+    def format_autotokenizer(
+        messages: List[llama_types.ChatCompletionRequestMessage],
+        **kwargs: Any,
+    ) -> ChatFormatterResponse:
+        tokenizer.use_default_system_prompt = False
+        _prompt = tokenizer.apply_chat_template(messages, tokenize=False)
+        # Return formatted prompt and eos token by default
+        return ChatFormatterResponse(prompt=_prompt, stop=tokenizer.eos_token)
+
+    return format_autotokenizer
+
+
 # see https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/tokenization_llama.py
 # system prompt is "embedded" in the first message
 @register_chat_format("llama-2")
@@ -510,26 +532,6 @@ def format_chatml(
     _prompt = _format_chatml(system_message, _messages, _sep)
     return ChatFormatterResponse(prompt=_prompt)
 
-# eg, export HF_MODEL=mistralai/Mistral-7B-Instruct-v0.1
-@register_chat_format("autotokenizer")
-def format_autotokenizer(
-    messages: List[llama_types.ChatCompletionRequestMessage],
-    **kwargs: Any,
-) -> ChatFormatterResponse:
-    # https://huggingface.co/docs/transformers/main/chat_templating
-    # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1#instruction-format
-    # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1/blob/main/tokenizer_config.json
-    import os
-    from transformers import AutoTokenizer
-    huggingFaceModel = os.getenv("HF_MODEL") # eg, mistralai/Mistral-7B-Instruct-v0.1
-    print(huggingFaceModel)
-    if not huggingFaceModel:
-        raise Exception("HF_MODEL needs to be set in env to use chat format 'autotokenizer'")
-    tokenizer = AutoTokenizer.from_pretrained(huggingFaceModel)
-    tokenizer.use_default_system_prompt = False
-    _prompt = tokenizer.apply_chat_template(messages, tokenize=False)
-    # Return formatted prompt and eos token by default
-    return ChatFormatterResponse(prompt=_prompt, stop=tokenizer.eos_token)
 
 @register_chat_completion_handler("functionary")
 def functionary_chat_handler(
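
For context, a minimal usage sketch of the new factory. This is an illustration, not part of the diff: the Mistral repo id and the message payload are examples, and it assumes transformers is installed and the function is imported from llama_cpp.llama_chat_format.

    from llama_cpp.llama_chat_format import hf_autotokenizer_to_chat_formatter

    # Build a formatter from any Hugging Face repo that ships a chat template
    # (the repo id below is an example, not fixed by this change)
    formatter = hf_autotokenizer_to_chat_formatter("mistralai/Mistral-7B-Instruct-v0.1")

    # The returned callable renders messages through the tokenizer's chat template
    response = formatter(messages=[{"role": "user", "content": "Hello!"}])
    print(response.prompt)  # the templated prompt string
    print(response.stop)    # the tokenizer's EOS token, returned as the stop sequence

Unlike the removed "autotokenizer" chat format, the model id is passed explicitly rather than read from the HF_MODEL environment variable, so a caller can build formatters for several models without mutating the environment.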