01 Introduction
InternVL 2.5 was recently released. Its performance rivals leading commercial models such as GPT-4o and Claude-3.5-Sonnet, and it is the first open-source model to exceed 70% on MMMU, gaining 3.7 percentage points from chain-of-thought (CoT) reasoning and showing strong potential for test-time scaling. InternVL 2.5 builds on InternVL 2.0, improving performance further through enhanced training and test-time strategies and higher-quality data. The work systematically studies the relationship between model scale and performance across the vision encoder, the language model, dataset size, and test-time configurations. In extensive evaluations, InternVL 2.5 delivers competitive results on a wide range of benchmarks, particularly in multi-discipline reasoning, document understanding, multi-image/video understanding, real-world comprehension, multimodal hallucination detection, visual grounding, multilingual capability, and pure language processing.
📕 Key takeaways:
1. InternVL 2.5, an open-source multimodal large language model, has been released, pushing the performance frontier through data and test-time scaling.
2. Experiments show that InternVL 2.5 is competitive across a wide range of benchmarks, matching or surpassing commercial models such as GPT-4o and Claude-3.5-Sonnet on a number of them.
3. The model adopts new training and test-time strategies together with higher-quality datasets, and handles multiple modalities, including text, images, and video.
4. With chain-of-thought reasoning, the model reaches over 70% accuracy on the MMMU benchmark, demonstrating strong test-time scaling potential.
5. The work sets a new standard for the open-source community in building and deploying multimodal AI systems.
InternVL 2.5 retains the same model architecture as its predecessors, InternVL 1.5 and InternVL 2.0, following the "ViT-MLP-LLM" paradigm widely adopted in MLLM research. It integrates a newly incrementally pre-trained InternViT-6B or InternViT-300M with pre-trained LLMs of various sizes and families, including InternLM 2.5 and Qwen 2.5, connected through a randomly initialized two-layer MLP projector. As in previous versions, to keep high-resolution processing scalable, the team simply applies a pixel unshuffle operation that reduces the number of visual tokens to one quarter of the original. As a result, in InternVL 2.5 a 448×448 image tile is represented by 256 visual tokens.
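To make the token arithmetic concrete, here is a minimal back-of-the-envelope sketch. It assumes a ViT patch size of 14 and a pixel-unshuffle downscale factor of 0.5 (values taken from the InternVL reports, not from code in this article):
# Rough check of the visual token count for a single 448x448 tile.
# Assumptions: ViT patch size 14, pixel-unshuffle downscale factor 0.5.
tile_size = 448
patch_size = 14
downscale_factor = 0.5  # pixel unshuffle merges each 2x2 group of tokens into one

patches_per_side = tile_size // patch_size                      # 32
vit_tokens = patches_per_side ** 2                              # 1024 tokens out of the ViT
llm_tokens = int(patches_per_side * downscale_factor) ** 2      # 256 tokens passed to the LLM
print(vit_tokens, llm_tokens)  # 1024 256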
Model links:
https://www.modelscope.cn/collections/InternVL-25-fbde6e47302942
02 Model Download
Download via the command line:
modelscope download --model OpenGVLab/InternVL2_5-4B
Download via the Python SDK:
# Download the model
from modelscope import snapshot_download
model_dir = snapshot_download('OpenGVLab/InternVL2_5-4B')
03 Model Inference
Inference with transformers
import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from modelscope import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
best_ratio_diff = float('inf')
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
if ratio_diff < best_ratio_diff:
best_ratio_diff = ratio_diff
best_ratio = ratio
elif ratio_diff == best_ratio_diff:
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
best_ratio = ratio
return best_ratio
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
# calculate the existing image aspect ratio
target_ratios = set(
(i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
i * j <= max_num and i * j >= min_num)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images
def load_image(image_file, input_size=448, max_num=12):
image = Image.open(image_file).convert('RGB')
transform = build_transform(input_size=input_size)
images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
pixel_values = [transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return pixel_values
# To load the model across multiple GPUs, refer to the `Multiple GPUs` section of the model card.
path = 'OpenGVLab/InternVL2_5-4B'
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
use_flash_attn=True,
trust_remote_code=True).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
# set the max number of tiles in `max_num`
pixel_values = load_image('./awesome.png', max_num=12).to(torch.bfloat16).cuda()
generation_config = dict(max_new_tokens=1024, do_sample=True)
# pure-text conversation (纯文本对话)
question = 'Hello, who are you?'
response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')
question = 'Can you tell me a story?'
response, history = model.chat(tokenizer, None, question, generation_config, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')
# single-image single-round conversation (单图单轮对话)
question = '<image>\nPlease describe the image shortly.'
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(f'User: {question}\nAssistant: {response}')
# single-image multi-round conversation (单图多轮对话)
question = '<image>\nPlease describe the image in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')
question = 'Please write a poem according to the image.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')
# multi-image multi-round conversation, combined images (多图多轮对话,拼接图像)
pixel_values1 = load_image('./awesome.png', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./noword.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
question = '<image>\nDescribe the two images in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')
question = 'What are the similarities and differences between these two images.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')
# multi-image multi-round conversation, separate images (多图多轮对话,独立图像)
pixel_values1 = load_image('./awesome.png', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./noword.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
num_patches_list=num_patches_list,
history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')
question = 'What are the similarities and differences between these two images.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
num_patches_list=num_patches_list,
history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')
# batch inference, single image per sample (单图批处理)
pixel_values1 = load_image('./awesome.png', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./noword.jpg', max_num=12).to(torch.bfloat16).cuda()
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
questions = ['<image>\nDescribe the image in detail.'] * len(num_patches_list)
responses = model.batch_chat(tokenizer, pixel_values,
num_patches_list=num_patches_list,
questions=questions,
generation_config=generation_config)
for question, response in zip(questions, responses):
print(f'User: {question}\nAssistant: {response}')
# video multi-round conversation (视频多轮对话)
def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
if bound:
start, end = bound[0], bound[1]
else:
start, end = -100000, 100000
start_idx = max(first_idx, round(start * fps))
end_idx = min(round(end * fps), max_frame)
seg_size = float(end_idx - start_idx) / num_segments
frame_indices = np.array([
int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
for idx in range(num_segments)
])
return frame_indices
def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
max_frame = len(vr) - 1
fps = float(vr.get_avg_fps())
pixel_values_list, num_patches_list = [], []
transform = build_transform(input_size=input_size)
frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
for frame_index in frame_indices:
img = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB')
img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
pixel_values = [transform(tile) for tile in img]
pixel_values = torch.stack(pixel_values)
num_patches_list.append(pixel_values.shape[0])
pixel_values_list.append(pixel_values)
pixel_values = torch.cat(pixel_values_list)
return pixel_values, num_patches_list
video_path = './showcase.mp4'
pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
pixel_values = pixel_values.to(torch.bfloat16).cuda()
video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
question = video_prefix + 'What is the red panda doing?'
# Frame1: <image>\nFrame2: <image>\n...\nFrame8: <image>\n{question}
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
num_patches_list=num_patches_list, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')
question = 'Describe this video in detail. Don\'t repeat.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
num_patches_list=num_patches_list, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')
Streaming output:
from transformers import TextIteratorStreamer
from threading import Thread
# Initialize the streamer
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10)
# Define the generation configuration
generation_config = dict(max_new_tokens=1024, do_sample=False, streamer=streamer)
# Start the model chat in a separate thread
thread = Thread(target=model.chat, kwargs=dict(
tokenizer=tokenizer, pixel_values=pixel_values, question=question,
history=None, return_history=False, generation_config=generation_config,
))
thread.start()
# Initialize an empty string to store the generated text
generated_text = ''
# Loop through the streamer to get the new text as it is generated
for new_text in streamer:
if new_text == model.conv_template.sep:
break
generated_text += new_text
print(new_text, end='', flush=True) # Print each new chunk of generated text on the same line
GPU memory usage:
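The original post shows a screenshot of the memory footprint here. To measure it on your own hardware, a minimal sketch using PyTorch's built-in counters, run right after the inference calls above:
# Peak and current GPU memory after running the examples above.
print(f'Peak GPU memory: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GiB')
print(f'Currently allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GiB')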
Inference with lmdeploy
Install the dependencies:
pip install lmdeploy -U
Example code:
from lmdeploy import pipeline, TurbomindEngineConfig
from lmdeploy.vl import load_image
from modelscope import snapshot_download
model = snapshot_download('OpenGVLab/InternVL2_5-4B')
image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=8192))
response = pipe(('describe this image', image))
print(response.text)
Deploy a local service with lmdeploy:
lmdeploy serve api_server ./InternVL2_5-4B/ --backend turbomind --server-port 23333
Call the inference service:
from openai import OpenAI
client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1')
model_name = client.models.list().data[0].id
response = client.chat.completions.create(
model=model_name,
messages=[{
'role':
'user',
'content': [{
'type': 'text',
'text': 'describe this image',
}, {
'type': 'image_url',
'image_url': {
'url':
'https://modelscope.oss-cn-beijing.aliyuncs.com/resource/tiger.jpeg',
},
}],
}],
temperature=0.8,
top_p=0.8)
print(response)
04 Model Training
We fine-tune InternVL2.5-2B with ms-swift 3.0. ms-swift is the official LLM and multimodal LLM fine-tuning and deployment framework from the ModelScope community, supporting 400+ LLMs and 100+ multimodal LLMs.
Here we fine-tune InternVL2.5-2B for LaTeX-OCR using the Python API. Walking through the individual steps makes the details of fine-tuning clear, which is helpful when customizing your own training process.
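For reference, roughly the same LoRA fine-tuning can also be launched from the command line. The sketch below follows the flag names used in the ms-swift 3.0 examples; treat it as an approximation and check `swift sft --help` for the exact options in your installed version:
# Approximate CLI equivalent of the Python walkthrough below (flag names assumed from ms-swift 3.0 examples).
CUDA_VISIBLE_DEVICES=0 swift sft \
    --model OpenGVLab/InternVL2_5-2B \
    --dataset 'AI-ModelScope/LaTeX_OCR#20000' \
    --train_type lora \
    --lora_rank 8 \
    --lora_alpha 32 \
    --num_train_epochs 1 \
    --learning_rate 1e-4 \
    --output_dir output/InternVL2_5-2B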
If you run into compatibility issues, please refer to:
https://github.com/modelscope/ms-swift/tree/main/examples/train/notebook
First, install ms-swift 3.0 from source:
git clone https://github.com/modelscope/ms-swift.git
cd ms-swift
pip install -e '.[llm]'
Then import the required packages:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
from swift.llm import (
get_model_tokenizer, load_dataset, get_template, EncodePreprocessor, get_model_arch,
get_multimodal_target_regex, LazyLLMDataset
)
from swift.utils import get_logger, get_model_parameter_info, plot_images, seed_everything
from swift.tuners import Swift, LoraConfig
from swift.trainers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from functools import partial
logger = get_logger()
seed_everything(42)
Set the training hyperparameters:
# model
model_id_or_path = 'OpenGVLab/InternVL2_5-2B'
system = None  # use the default system prompt defined in the template
output_dir = 'output/InternVL2_5-2B'
# dataset
dataset = ['AI-ModelScope/LaTeX_OCR#20000']  # dataset_id or dataset_path; here we sample 20,000 examples
data_seed = 42
max_length = 8192
split_dataset_ratio = 0.01  # fraction of the data split off as the validation set
num_proc = 4  # number of processes for data preprocessing
strict = False
# lora
lora_rank = 8
lora_alpha = 32
freeze_llm = False
freeze_vit = True
freeze_aligner = True
# training_args
training_args = Seq2SeqTrainingArguments(
output_dir=output_dir,
learning_rate=1e-4,
per_device_train_batch_size=1,
per_device_eval_batch_size=1,
gradient_checkpointing=True,
weight_decay=0.1,
lr_scheduler_type='cosine',
warmup_ratio=0.05,
report_to=['tensorboard'],
logging_first_step=True,
save_strategy='steps',
save_steps=100,
eval_strategy='steps',
eval_steps=100,
gradient_accumulation_steps=16,
# set to 1 here so training results can be checked quickly; normally this should be a larger value
num_train_epochs=1,
metric_for_best_model='loss',
save_total_limit=2,
logging_steps=5,
dataloader_num_workers=4,
data_seed=data_seed,
remove_unused_columns=False
)
output_dir = os.path.abspath(os.path.expanduser(output_dir))
logger.info(f'output_dir: {output_dir}')
Prepare the model and chat template:
# get the model and template
model, processor = get_model_tokenizer(model_id_or_path)
logger.info(f'model_info: {model.model_info}')
template = get_template(model.model_meta.template, processor, default_system=system, max_length=max_length)
template.set_mode('train')
# get target_modules and attach trainable LoRA modules to the model
model_arch = get_model_arch(model.model_meta.model_arch)
target_modules = get_multimodal_target_regex(model_arch, freeze_llm=freeze_llm, freeze_vit=freeze_vit,
freeze_aligner=freeze_aligner)
lora_config = LoraConfig(task_type='CAUSAL_LM', r=lora_rank, lora_alpha=lora_alpha,
target_modules=target_modules)
model = Swift.prepare_model(model, lora_config)
logger.info(f'lora_config: {lora_config}')
# print the model structure and trainable parameters
logger.info(f'model: {model}')
model_parameter_info = get_model_parameter_info(model)
logger.info(f'model_parameter_info: {model_parameter_info}')
Prepare the training and validation datasets:
# download and load the dataset, then split it into a training set and a validation set
train_dataset, val_dataset = load_dataset(dataset, split_dataset_ratio=split_dataset_ratio, num_proc=num_proc,
strict=strict, seed=data_seed)
logger.info(f'train_dataset: {train_dataset}')
logger.info(f'val_dataset: {val_dataset}')
logger.info(f'train_dataset[0]: {train_dataset[0]}')
# encode the text into tokens
train_dataset = LazyLLMDataset(
train_dataset, template.encode, strict=strict, random_state=data_seed)
val_dataset = LazyLLMDataset(
val_dataset, template.encode, strict=strict, random_state=data_seed)
data = train_dataset[0]
logger.info(f'encoded_train_dataset[0]: {data}')
template.print_inputs(data)
Start training with the trainer:
model.enable_input_require_grads()  # compatibility with gradient checkpointing
template.register_post_encode_hook([model])  # register post_encode as a forward_pre_hook
trainer = Seq2SeqTrainer(
model=model,
args=training_args,
data_collator=template.data_collator,
train_dataset=train_dataset,
eval_dataset=val_dataset,
template=template,
)
trainer.model_accepts_loss_kwargs = True  # compatibility with transformers>=4.46
trainer.train()
last_model_checkpoint = trainer.state.last_model_checkpoint
logger.info(f'last_model_checkpoint: {last_model_checkpoint}')
Visualize the training loss (we trained for only 400 steps here):
You can also visualize the training loss during training with TensorBoard by running: `tensorboard --logdir '{output_dir}/runs'`
images_dir = os.path.join(output_dir, 'images')
logger.info(f'images_dir: {images_dir}')
plot_images(images_dir, training_args.logging_dir, ['train/loss'], 0.9)  # save the training-loss plot
Inference after training
Import the required packages:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
from swift.llm import (
InferEngine, InferRequest, PtEngine, RequestConfig, get_template, load_dataset, load_image
)
from swift.tuners import Swift
from swift.utils import get_model_parameter_info, get_logger, seed_everything
logger = get_logger()
seed_everything(42)
Set the inference hyperparameters:
last_model_checkpoint = 'output/InternVL2_5-2B/vx-xxx/checkpoint-xxx'
# model
model_id_or_path = 'OpenGVLab/InternVL2_5-2B' # model_id or model_path
# dataset
dataset = ['AI-ModelScope/LaTeX_OCR#20000']
data_seed = 42
split_dataset_ratio = 0.01
num_proc = 4
strict = False
# generation_config
max_new_tokens = 512
temperature = 0
We use the 'pt' infer_backend to run inference with the fine-tuned model. To accelerate inference with vllm/lmdeploy, see: https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo_mllm.py
engine = PtEngine(model_id_or_path)
engine.model = Swift.from_pretrained(engine.model, last_model_checkpoint)
engine.model.requires_grad_(False)  # work around peft setting requires_grad=True on the embedding layer
template = get_template(engine.model.model_meta.template, engine.tokenizer)
model_parameter_info = get_model_parameter_info(engine.model)
logger.info(f'model_parameter_info: {model_parameter_info}')
Get the validation set:
# because data_seed is fixed, this is the same validation set used during training
_, val_dataset = load_dataset(dataset, split_dataset_ratio=split_dataset_ratio, num_proc=num_proc,
strict=strict, seed=data_seed)
val_dataset = val_dataset.select(range(10))  # take the first 10 samples
Run streaming inference and save the images from the validation set:
def infer_stream(engine: InferEngine, infer_request: InferRequest):
request_config = RequestConfig(max_tokens=max_new_tokens, temperature=temperature, stream=True)
gen = engine.infer([infer_request], request_config)
query = infer_request.messages[0]['content']
print(f'query: {query}\nresponse: ', end='')
for resp_list in gen:
print(resp_list[0].choices[0].delta.content, end='', flush=True)
print()
os.makedirs('images', exist_ok=True)
for i, data in enumerate(val_dataset):
image = load_image(data['images'][0]['bytes'])
image.save(f'images/{i}.png')
infer_stream(engine, InferRequest(**data))
print('-' * 50)
Inference results: