
Multimodal Transforms

Multimodal model transforms apply model-specific data transforms to each modality and prepare Message objects to be input into the model. torchtune currently supports text + image model transforms. These are intended as drop-in replacements for tokenizers in multimodal datasets and support the standard encode, decode, and tokenize_messages methods.

For example, here is the high-level structure of Llama3VisionTransform, followed by a usage example:

# torchtune.models.llama3_2_vision.Llama3VisionTransform
class Llama3VisionTransform(ModelTokenizer, Transform):
    def __init__(...):
        # Text transform - standard tokenization
        self.tokenizer = llama3_tokenizer(...)
        # Image transforms
        self.transform_image = CLIPImageTransform(...)
        self.xattn_mask = VisionCrossAttentionMask(...)

from torchtune.models.llama3_2_vision import Llama3VisionTransform
from torchtune.data import Message
from PIL import Image

sample = {
    "messages": [
        Message(
            role="user",
            content=[
                {"type": "image", "content": Image.new(mode="RGB", size=(224, 224))},
                {"type": "image", "content": Image.new(mode="RGB", size=(224, 224))},
                {"type": "text", "content": "What is common in these two images?"},
            ],
        ),
        Message(
            role="assistant",
            content="A robot is in both images.",
        ),
    ],
}
transform = Llama3VisionTransform(
    path="/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model",
    tile_size=224,
    patch_size=14,
)
tokenized_dict = transform(sample)
print(transform.decode(tokenized_dict["tokens"], skip_special_tokens=False))
# '<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n<|image|><|image|>What is common in these two images?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nA robot is in both images.<|eot_id|>'
print(tokenized_dict["encoder_input"]["images"][0].shape)  # (num_tiles, num_channels, tile_height, tile_width)
# torch.Size([4, 3, 224, 224])
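
Because the transform implements the standard tokenizer interface, you can also call tokenize_messages on it directly. A minimal sketch reusing sample and transform from above, assuming the call returns a token list and a parallel loss-mask list as in the ModelTokenizer protocol shown later in this page:

tokens, mask = transform.tokenize_messages(sample["messages"])
print(len(tokens) == len(mask))  # one mask bool per token
# True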

Using model transforms

You can pass them into any multimodal dataset builder just as you would a model tokenizer.

from torchtune.datasets.multimodal import the_cauldron_dataset
from torchtune.models.llama3_2_vision import Llama3VisionTransform

transform = Llama3VisionTransform(
    path="/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model",
    tile_size=224,
    patch_size=14,
)
ds = the_cauldron_dataset(
    model_transform=transform,
    subset="ai2d",
)
tokenized_dict = ds[0]
print(transform.decode(tokenized_dict["tokens"], skip_special_tokens=False))
# <|begin_of_text|><|start_header_id|>user<|end_header_id|>
#
# <|image|>Question: What do respiration and combustion give out
# Choices:
# A. Oxygen
# B. Carbon dioxide
# C. Nitrogen
# D. Heat
# Answer with the letter.<|eot_id|><|start_header_id|>assistant<|end_header_id|>
#
# Answer: B<|eot_id|>
print(tokenized_dict["encoder_input"]["images"][0].shape)  # (num_tiles, num_channels, tile_height, tile_width)
# torch.Size([4, 3, 224, 224])
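
For training you will typically batch these samples with a PyTorch DataLoader. The sketch below uses a deliberately minimal, hypothetical collate function that pads token sequences and keeps image tensors in nested lists; for real recipes, prefer the padded collate utilities that ship with torchtune (check torchtune.data in your version):

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def simple_collate(batch):
    # Hypothetical minimal collate: pad token ids to the longest sequence
    # in the batch; leave tiled image tensors in nested lists.
    tokens = pad_sequence(
        [torch.tensor(s["tokens"]) for s in batch],
        batch_first=True,
        padding_value=0,
    )
    images = [s["encoder_input"]["images"] for s in batch]
    return {"tokens": tokens, "images": images}

dataloader = DataLoader(ds, batch_size=2, collate_fn=simple_collate)
batch = next(iter(dataloader))
print(batch["tokens"].shape)  # (2, max_seq_len_in_batch)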

Creating model transforms

Model transforms are expected to process both text and images in the sample dictionary. Both should be contained in the "messages" field of the sample.

Model transforms are required to have the following methods:

  • tokenize_messages

  • __call__

from typing import Any, List, Mapping, Tuple

from PIL import Image

from torchtune.data import Message
from torchtune.modules.tokenizers import ModelTokenizer
from torchtune.modules.transforms import Transform

class MyMultimodalTransform(ModelTokenizer, Transform):
    def __init__(...):
        self.tokenizer = my_tokenizer_builder(...)
        self.transform_image = MyImageTransform(...)

    def tokenize_messages(
        self,
        messages: List[Message],
        add_eos: bool = True,
    ) -> Tuple[List[int], List[bool]]:
        # Any other custom logic here
        ...

        return self.tokenizer.tokenize_messages(
            messages=messages,
            add_eos=add_eos,
        )

    def __call__(
        self, sample: Mapping[str, Any], inference: bool = False
    ) -> Mapping[str, Any]:
        # Expected input parameters for vision encoder
        encoder_input = {"images": [], "aspect_ratio": []}
        messages = sample["messages"]

        # Transform all images in sample
        for message in messages:
            for image in message.get_media():
                out = self.transform_image({"image": image}, inference=inference)
                encoder_input["images"].append(out["image"])
                encoder_input["aspect_ratio"].append(out["aspect_ratio"])
        sample["encoder_input"] = encoder_input

        # Transform all text - returns same dictionary with additional keys "tokens" and "mask"
        sample = self.tokenizer(sample, inference=inference)

        return sample

transform = MyMultimodalTransform(...)
sample = {
    "messages": [
        Message(
            role="user",
            content=[
                {"type": "image", "content": Image.new(mode="RGB", size=(224, 224))},
                {"type": "image", "content": Image.new(mode="RGB", size=(224, 224))},
                {"type": "text", "content": "What is common in these two images?"},
            ],
        ),
        Message(
            role="assistant",
            content="A robot is in both images.",
        ),
    ],
}
tokenized_dict = transform(sample)
print(tokenized_dict)
# {'encoder_input': {'images': ..., 'aspect_ratio': ...}, 'tokens': ..., 'mask': ...}
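
The custom transform can then be passed to any multimodal dataset builder in place of a built-in transform. A sketch, assuming MyMultimodalTransform's constructor arguments:

from torchtune.datasets.multimodal import the_cauldron_dataset

ds = the_cauldron_dataset(
    model_transform=MyMultimodalTransform(...),
    subset="ai2d",
)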

Example model transforms

  • Llama3VisionTransform
