Multimodal Transforms¶
Multimodal model transforms apply model-specific data transforms to each modality and prepare Message objects to be input into the model. torchtune currently supports text + image model transforms. These are intended to be drop-in replacements for tokenizers in multimodal datasets and support the standard encode, decode, and tokenize_messages methods.
# torchtune.models.llama3_2_vision.Llama3VisionTransform
class Llama3VisionTransform(ModelTokenizer, Transform):
    def __init__(...):
        # Text transform - standard tokenization
        self.tokenizer = llama3_tokenizer(...)
        # Image transforms
        self.transform_image = CLIPImageTransform(...)
        self.xattn_mask = VisionCrossAttentionMask(...)
from torchtune.models.llama3_2_vision import Llama3VisionTransform
from torchtune.data import Message
from PIL import Image

sample = {
    "messages": [
        Message(
            role="user",
            content=[
                {"type": "image", "content": Image.new(mode="RGB", size=(224, 224))},
                {"type": "image", "content": Image.new(mode="RGB", size=(224, 224))},
                {"type": "text", "content": "What is common in these two images?"},
            ],
        ),
        Message(
            role="assistant",
            content="A robot is in both images.",
        ),
    ],
}
transform = Llama3VisionTransform(
    path="/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model",
    tile_size=224,
    patch_size=14,
)
tokenized_dict = transform(sample)
print(transform.decode(tokenized_dict["tokens"], skip_special_tokens=False))
# '<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n<|image|><|image|>What is common in these two images?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nA robot is in both images.<|eot_id|>'
print(tokenized_dict["encoder_input"]["images"][0].shape)  # (num_tiles, num_channels, tile_height, tile_width)
# torch.Size([4, 3, 224, 224])
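Because the transform is a drop-in replacement for a model tokenizer, the standard tokenizer methods can also be called directly. A minimal sketch, assuming the transform built above; the add_bos/add_eos keyword arguments are assumed to follow the underlying llama3_tokenizer and may differ across versions:

# Hedged sketch: standard tokenizer interface on the same transform object.
token_ids = transform.encode("Hello world!", add_bos=True, add_eos=True)
print(transform.decode(token_ids))
# 'Hello world!'

# tokenize_messages returns the token ids and the loss mask for the sample
tokens, mask = transform.tokenize_messages(sample["messages"])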
Using model transforms¶
You can pass them into any multimodal dataset builder just as you would a model tokenizer.
from torchtune.datasets.multimodal import the_cauldron_dataset
from torchtune.models.llama3_2_vision import Llama3VisionTransform

transform = Llama3VisionTransform(
    path="/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model",
    tile_size=224,
    patch_size=14,
)
ds = the_cauldron_dataset(
    model_transform=transform,
    subset="ai2d",
)
tokenized_dict = ds[0]
print(transform.decode(tokenized_dict["tokens"], skip_special_tokens=False))
# <|begin_of_text|><|start_header_id|>user<|end_header_id|>
#
# <|image|>Question: What do respiration and combustion give out
# Choices:
# A. Oxygen
# B. Carbon dioxide
# C. Nitrogen
# D. Heat
# Answer with the letter.<|eot_id|><|start_header_id|>assistant<|end_header_id|>
#
# Answer: B<|eot_id|>
print(tokenized_dict["encoder_input"]["images"][0].shape)  # (num_tiles, num_channels, tile_height, tile_width)
# torch.Size([4, 3, 224, 224])
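The resulting dataset behaves like any map-style torch dataset, and every sample already carries both the tokenized text and the transformed encoder inputs. A minimal sketch of inspecting a few samples, assuming the ds built above:

# Hedged sketch: each sample is a dict with "tokens" and "encoder_input"
# keys produced by the model transform.
for i in range(3):
    sample = ds[i]
    print(
        f"sample {i}: {len(sample['tokens'])} tokens, "
        f"{len(sample['encoder_input']['images'])} image(s)"
    )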
Creating model transforms¶
Model transforms are expected to process both text and images in the sample dictionary. Both should be contained in the sample's "messages" field.
A model transform is required to implement the following methods:
- tokenize_messages
- __call__
from typing import Any, List, Mapping, Tuple

from torchtune.data import Message
from torchtune.modules.tokenizers import ModelTokenizer
from torchtune.modules.transforms import Transform

class MyMultimodalTransform(ModelTokenizer, Transform):
    def __init__(...):
        self.tokenizer = my_tokenizer_builder(...)
        self.transform_image = MyImageTransform(...)

    def tokenize_messages(
        self,
        messages: List[Message],
        add_eos: bool = True,
    ) -> Tuple[List[int], List[bool]]:
        # Any other custom logic here
        ...
        return self.tokenizer.tokenize_messages(
            messages=messages,
            add_eos=add_eos,
        )

    def __call__(
        self, sample: Mapping[str, Any], inference: bool = False
    ) -> Mapping[str, Any]:
        # Expected input parameters for vision encoder
        encoder_input = {"images": [], "aspect_ratio": []}
        messages = sample["messages"]

        # Transform all images in sample
        for message in messages:
            for image in message.get_media():
                out = self.transform_image({"image": image}, inference=inference)
                encoder_input["images"].append(out["image"])
                encoder_input["aspect_ratio"].append(out["aspect_ratio"])
        sample["encoder_input"] = encoder_input

        # Transform all text - returns same dictionary with additional keys "tokens" and "mask"
        sample = self.tokenizer(sample, inference=inference)
        return sample
transform = MyMultimodalTransform(...)
sample = {
    "messages": [
        Message(
            role="user",
            content=[
                {"type": "image", "content": Image.new(mode="RGB", size=(224, 224))},
                {"type": "image", "content": Image.new(mode="RGB", size=(224, 224))},
                {"type": "text", "content": "What is common in these two images?"},
            ],
        ),
        Message(
            role="assistant",
            content="A robot is in both images.",
        ),
    ],
}
tokenized_dict = transform(sample)
print(tokenized_dict)
# {'encoder_input': {'images': ..., 'aspect_ratio': ...}, 'tokens': ..., 'mask': ...}
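Once defined, the custom transform can be passed anywhere a model tokenizer is expected, such as the generic SFTDataset builder. A hedged sketch; MyMessageTransform and the dataset path are hypothetical placeholders for your own data source:

from torchtune.datasets import SFTDataset

# Hedged sketch: wiring a custom multimodal transform into the generic
# SFT dataset builder. `MyMessageTransform` is hypothetical and would map
# raw rows from `source` into Message objects.
ds = SFTDataset(
    source="my_org/my_dataset",  # hypothetical Hugging Face dataset path
    message_transform=MyMessageTransform(),
    model_transform=MyMultimodalTransform(...),
)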
Example model transforms¶
- Llama 3.2 Vision