AI开发平台MODELARTS-自定义镜像规范:“train.py”示例
“train.py”示例
变量名称 |
说明 |
示例 |
---|---|---|
ENV_AG_MODEL_DIR |
模型存放路径,AI Gallery的模型仓库地址,包含模型仓库的所有文件。 |
“/home/ma-user/.cache/gallery/model/ur12345--gpt2” |
ENV_AG_DATASET_DIR |
数据集存放路径,AI Gallery的数据集仓库地址,包含数据集仓库的所有文件。 |
“/home/ma-user/.cache/gallery/dataset/ur12345--data_demo” |
ENV_AG_USER_PARAMS |
配置的训练超参json字符串。创建训练任务时在算法配置页面设置的超参,用json字符串表示。 |
{"per_device_eval_batch_size":"32","lr":"0.001","logging_steps":"24"} |
ENV_AG_TRAIN_OUTPUT_DIR |
训练产物文件存放路径。训练产物将被保存到该路径。训练任务结束后,由AI Gallery平台将该目录上传到新模型的仓库中。 |
“/home/ma-user/.cache/gallery/output” |
ENV_AG_USER_METRICS_LOG_PATH |
训练数据的日志文件存放路径。训练过程中的迭代次数、LOSS和吞吐数据按照“迭代次数|loss|吞吐”格式记录在日志中,AI Gallery通过环境变量找到日志,从中获取实际数据绘制成“吞吐”和“训练LOSS”曲线,呈现在训练的“指标效果”中。具体请参见查看训练效果。 说明:
日志文件中的迭代次数、LOSS和吞吐数据必须按照“迭代次数|loss|吞吐”格式存放,否则AI Gallery会数据解析失败,导致“吞吐”和“训练LOSS”曲线异常。 |
“/var/logs/user_metrics.log” |
"""Example ``train.py`` for an AI Gallery custom image.

Fine-tunes an image-classification model with the Hugging Face ``Trainer``.
The model directory, dataset directory, user hyperparameters and output
directory are all read from AI Gallery environment variables.
"""
import json
import os

import numpy as np
from datasets import load_dataset
from sklearn import metrics
from torchvision.transforms import (
    Compose,
    Normalize,
    RandomHorizontalFlip,
    RandomResizedCrop,
    ToTensor,
)
from transformers import (
    AutoImageProcessor,
    AutoModelForImageClassification,
    DefaultDataCollator,
    Trainer,
    TrainingArguments,
)

# Names of the environment variables read by this script.
# Working directory.
ENV_AG_WORK_DIR = 'ENV_AG_WORK_DIR'
# Directory holding the model repository files.
ENV_AG_MODEL_DIR = 'ENV_AG_MODEL_DIR'
# Directory holding the dataset repository files.
ENV_AG_DATASET_DIR = 'ENV_AG_DATASET_DIR'
# JSON string with the user-configured training hyperparameters.
ENV_AG_USER_PARAMS = 'ENV_AG_USER_PARAMS'
# Directory the training artifacts are written to.
ENV_AG_TRAIN_OUTPUT_DIR = 'ENV_AG_TRAIN_OUTPUT_DIR'

# Fallbacks for hyperparameters the user did not configure, so a partial
# hyperparameter JSON does not abort training with a KeyError.
# Values are strings because the platform passes hyperparameters as strings;
# they are converted with int()/float() in train().
# "save_strategy" falls back to "epoch" to match the per-epoch evaluation
# used in train(): load_best_model_at_end=True needs the two to agree.
# The remaining values follow the documented TrainingArguments defaults.
_DEFAULT_USER_PARAMS = {
    'save_strategy': 'epoch',
    'lr': '5e-5',
    'per_device_eval_batch_size': '8',
    'num_train_epochs': '3',
    'warmup_ratio': '0.0',
    'logging_steps': '500',
}

# Image transform pipeline; built by _process_input_data() and read by
# _format_transforms(), which the dataset calls lazily.
_transforms = None


def _multi_class_classification_metrics(pred):
    """Compute multi-class classification metrics for the Trainer.

    Args:
        pred: A ``(raw_predictions, labels)`` pair. The predicted class is
            the argmax of ``raw_predictions`` along axis 1.

    Returns:
        dict: F1, precision and recall, each with macro, micro and weighted
        averaging, plus ``accuracy``.
    """
    raw_predictions, labels = pred
    predictions = np.argmax(raw_predictions, axis=1)

    scorers = (
        ("f1", metrics.f1_score),
        ("precision", metrics.precision_score),
        ("recall", metrics.recall_score),
    )
    results = {}
    for prefix, scorer in scorers:
        for average in ("macro", "micro", "weighted"):
            results[f"{prefix}_{average}"] = scorer(labels, predictions, average=average)
    results["accuracy"] = metrics.accuracy_score(labels, predictions)
    return results


def parse_args():
    """Read the user-configured hyperparameters from the environment.

    Returns:
        dict: The decoded JSON object stored in ``ENV_AG_USER_PARAMS``, or an
        empty dict when the variable is unset or blank.

    Raises:
        json.JSONDecodeError: If the variable holds invalid JSON.
        ValueError: If the JSON value is not an object.
    """
    raw = os.getenv(ENV_AG_USER_PARAMS)
    # os.getenv returns None for an unset variable, which json.loads rejects
    # with an unhelpful TypeError; treat "no hyperparameters" as empty instead.
    if raw is None or not raw.strip():
        return {}

    params = json.loads(raw)
    if not isinstance(params, dict):
        raise ValueError(
            f"{ENV_AG_USER_PARAMS} must contain a JSON object, "
            f"got {type(params).__name__}"
        )
    return params


def _process_input_data(image_processor):
    """Load the image dataset and attach the training-time transforms.

    Args:
        image_processor: Image processor of the model; its ``image_mean``,
            ``image_std`` and ``size`` configure normalization and crop size.

    Returns:
        The dataset loaded from ``ENV_AG_DATASET_DIR`` with
        ``_format_transforms`` applied on access.
    """
    # Load the dataset.
    dataset_path = os.getenv(ENV_AG_DATASET_DIR)
    dataset = load_dataset("imagefolder", data_dir=dataset_path)

    # Data augmentation.
    normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
    # Use "shortest_edge" when the processor defines it, else (height, width).
    if "shortest_edge" in image_processor.size:
        size = image_processor.size["shortest_edge"]
    else:
        size = (image_processor.size["height"], image_processor.size["width"])

    global _transforms
    _transforms = Compose([RandomResizedCrop(size), RandomHorizontalFlip(), ToTensor(), normalize])
    return dataset.with_transform(_format_transforms)


def _format_transforms(examples):
    """Convert a batch of images into transformed ``pixel_values``.

    Each image is converted to RGB and passed through ``_transforms``; the
    original ``image`` column is removed from the batch.
    """
    examples["pixel_values"] = [_transforms(img.convert("RGB")) for img in examples["image"]]
    del examples["image"]
    return examples


def train(user_args):
    """Fine-tune, save and evaluate the image-classification model.

    Args:
        user_args: Mapping of user hyperparameters (see ``parse_args``).
            Recognized keys: ``save_strategy``, ``lr``,
            ``per_device_eval_batch_size``, ``num_train_epochs``,
            ``warmup_ratio`` and ``logging_steps``. Missing keys fall back to
            ``_DEFAULT_USER_PARAMS``.
    """
    # User-supplied values override the defaults.
    params = {**_DEFAULT_USER_PARAMS, **(user_args or {})}

    print('Start to process dataset')
    model_path = os.getenv(ENV_AG_MODEL_DIR)
    image_processor = AutoImageProcessor.from_pretrained(model_path)
    dataset = _process_input_data(image_processor)
    print(f"Dataset: {dataset}")

    # Mapping between label names and ids.
    classes = dataset["train"].features["label"].names
    label2id = {c: i for i, c in enumerate(classes)}
    id2label = {i: c for i, c in enumerate(classes)}

    print('Start to load model')
    # Load the model.
    model = AutoModelForImageClassification.from_pretrained(
        model_path,
        num_labels=len(classes),
        id2label=id2label,
        label2id=label2id,
        ignore_mismatched_sizes=True,
    )

    print('Start to set training args')
    # Training arguments.
    # NOTE(review): newer transformers releases rename ``evaluation_strategy``
    # to ``eval_strategy`` -- confirm against the version in the image.
    training_args = TrainingArguments(
        output_dir=os.getenv(ENV_AG_TRAIN_OUTPUT_DIR),
        remove_unused_columns=False,
        evaluation_strategy="epoch",
        save_strategy=params['save_strategy'],
        learning_rate=float(params['lr']),
        save_total_limit=3,
        per_device_train_batch_size=32,
        gradient_accumulation_steps=1,
        per_device_eval_batch_size=int(params['per_device_eval_batch_size']),
        num_train_epochs=int(params['num_train_epochs']),
        warmup_ratio=float(params['warmup_ratio']),
        logging_steps=int(params['logging_steps']),
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        push_to_hub=False,
    )

    print('Start to train')
    # Build the trainer.
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=DefaultDataCollator(),
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        tokenizer=image_processor,
        compute_metrics=_multi_class_classification_metrics,
    )

    # Start training.
    train_results = trainer.train()

    print('Start to save model')
    # Save the model, training metrics and trainer state.
    trainer.save_model()
    trainer.log_metrics("train", train_results.metrics)
    trainer.save_metrics("train", train_results.metrics)
    trainer.save_state()

    print('Start to evaluate')
    # Evaluate on the evaluation split.
    eva_metrics = trainer.evaluate()
    trainer.log_metrics("eval", eva_metrics)
    trainer.save_metrics("eval", eva_metrics)

    print('All Done')


if __name__ == '__main__':
    args = parse_args()
    train(args)
- ModelArts推理部署_创建AI应用_自定义镜像规范-华为云
- ModelArts自定义镜像_自定义镜像简介_如何使用自定义镜像
- ModelArts推理部署_模型_AI应用来源-华为云
- ModelArts推理部署_OBS导入_模型包规范-华为云
- ModelArts推理部署_服务_访问公网-华为云
- ModelArts模型训练_模型训练简介_如何训练模型
- ModelArts模型训练_创建训练作业_如何创建训练作业
- ModelArts分布式训练_分布式训练介绍_分布式调测
- 华为云ModelArts_ModelArts开发_AI全流程开发
- ModelArts推理部署_纳管Atlas 500_边缘服务-华为云