AI开发平台MODELARTS-编写Workflow

时间：2024-04-25 11:39:41

AI开发平台MODELARTS

编写Workflow

基于图像分类算法，构建仅包含单个训练节点的Workflow。

确保开发环境安装完成后，在ModelArts的Notebook环境中，通过JupyterLab输入如下示例代码。

from modelarts import workflow as wf

# Unified storage object used to manage the output directory in one place.
output_storage = wf.data.OutputStorage(
    name="output_storage",
    description="输出目录统一配置",
)

# Dataset object (a placeholder named "input_data").
dataset = wf.data.DatasetPlaceholder(
    name="input_data",
)

# Create the training job step.
# Hyperparameters passed as plain literals are fixed in this code; the ones
# wrapped in wf.Placeholder declare a name, a type, a default value and a
# description instead (presumably filled in when the workflow is run --
# confirm against the SDK documentation).
job_step = wf.steps.JobStep(
    name="training_job",
    title="图像分类训练",
    algorithm=wf.AIGalleryAlgorithm(
        subscription_id="***", # Subscription ID of the image classification algorithm; look it up on the algorithm management page. Optional parameter; a subscribed algorithm is used here as the example.
        item_version_id="10.0.0", # Version of the subscribed algorithm (10.0.0 in this example). Optional parameter; a subscribed algorithm is used here as the example.
        parameters=[
            wf.AlgorithmParameters(name="task_type", value="image_classification_v2"),
            wf.AlgorithmParameters(name="model_name", value="resnet_v1_50"),
            wf.AlgorithmParameters(name="do_train", value="True"),
            wf.AlgorithmParameters(name="do_eval_along_train", value="True"),
            wf.AlgorithmParameters(name="variable_update", value="horovod"),
            wf.AlgorithmParameters(name="learning_rate_strategy", value=wf.Placeholder(name="learning_rate_strategy", placeholder_type=wf.PlaceholderType.STR, default="0.002", description="训练的学习率策略(10:0.001,20:0.0001代表0-10个epoch学习率0.001，10-20epoch学习率0.0001),如果不指定epoch, 会根据验证精度情况自动调整学习率，并当精度没有明显提升时，训练停止")),
            wf.AlgorithmParameters(name="batch_size", value=wf.Placeholder(name="batch_size", placeholder_type=wf.PlaceholderType.INT, default=64, description="每步训练的图片数量（单卡）")),
            wf.AlgorithmParameters(name="eval_batch_size", value=wf.Placeholder(name="eval_batch_size", placeholder_type=wf.PlaceholderType.INT, default=64, description="每步验证的图片数量（单卡）")),
            wf.AlgorithmParameters(name="evaluate_every_n_epochs", value=wf.Placeholder(name="evaluate_every_n_epochs", placeholder_type=wf.PlaceholderType.FLOAT, default=1.0, description="每训练n个epoch做一次验证")),
            wf.AlgorithmParameters(name="save_model_secs", value=wf.Placeholder(name="save_model_secs", placeholder_type=wf.PlaceholderType.INT, default=60, description="保存模型的频率（单位：s)")),
            wf.AlgorithmParameters(name="save_summary_steps", value=wf.Placeholder(name="save_summary_steps", placeholder_type=wf.PlaceholderType.INT, default=10, description="保存summary的频率（单位：步)")),
            wf.AlgorithmParameters(name="log_every_n_steps", value=wf.Placeholder(name="log_every_n_steps", placeholder_type=wf.PlaceholderType.INT, default=10, description="打印日志的频率（单位：步)")),
            wf.AlgorithmParameters(name="do_data_cleaning", value=wf.Placeholder(name="do_data_cleaning", placeholder_type=wf.PlaceholderType.STR, default="True", description="是否进行数据清洗, 数据格式异常会导致训练失败，建议开启，保证训练稳定性。数据量过大时，数据清洗可能耗时较久，可自行线下清洗（支持BMP.JPEG,PNG格式, RGB三通道）。建议用JPEG格式数据")),
            wf.AlgorithmParameters(name="use_fp16", value=wf.Placeholder(name="use_fp16", placeholder_type=wf.PlaceholderType.STR, default="True", description="是否使用混合精度, 混合精度可以加速训练，但是可能会造成一点精度损失，如果对精度无极严格的要求，建议开启")),
            wf.AlgorithmParameters(name="xla_compile", value=wf.Placeholder(name="xla_compile", placeholder_type=wf.PlaceholderType.STR, default="True", description="是否开启xla编译，加速训练，默认启用")),
            wf.AlgorithmParameters(name="data_format", value=wf.Placeholder(name="data_format", placeholder_type=wf.PlaceholderType.ENUM, default="NCHW", enum_list=["NCHW", "NHWC"], description="输入数据类型，NHWC表示channel在最后，NCHW表channel在最前，默认值NCHW（速度有提升）")),
            wf.AlgorithmParameters(name="best_model", value=wf.Placeholder(name="best_model", placeholder_type=wf.PlaceholderType.STR, default="True", description="是否在训练过程中保存并使用精度最高的模型，而不是最新的模型。默认值True,保存最优模型。在一定误差范围内，最优模型会保存最新的高精度模型")),
            wf.AlgorithmParameters(name="jpeg_preprocess", value=wf.Placeholder(name="jpeg_preprocess", placeholder_type=wf.PlaceholderType.STR, default="True", description="是否使用jpeg预处理加速算子(仅支持jpeg格式数据)，可加速数据读取，提升性能，默认启用。如果数据格式不是jpeg格式，开启数据清洗功能即可使用"))
        ]
    ),
    # The job input "data_url" is bound to the dataset placeholder object.
    inputs=[wf.steps.JobInput(name="data_url", data=dataset)],
    # The job output "train_url" uses an OBS path built by joining
    # "/train_output/" onto the shared output storage object.
    outputs=[wf.steps.JobOutput(name="train_url", obs_config=wf.data.OBSOutputConfig(obs_path=output_storage.join("/train_output/")))],
    spec=wf.steps.JobSpec(
        resource=wf.steps.JobResource(
            # Training resource flavor: a JSON-typed placeholder; no default
            # value is set in this code.
            flavor=wf.Placeholder(
                name="training_flavor",
                placeholder_type=wf.PlaceholderType.JSON,
                description="训练资源规格"
            )
        )
    )
)

# Build the workflow object from the single training step and the shared
# output storage.
workflow = wf.Workflow(
    steps=[job_step],
    storages=[output_storage],
    name="image-classification-ResNeSt",
    desc="this is a image classification workflow",
)

# By default the workflow is created in the "default" workspace. To assign it
# to a specific workspace, pass the workspace argument as shown below:
# workflow = wf.Workflow(
#     steps=[job_step],
#     storages=[output_storage],
#     name="image-classification-ResNeSt",
#     desc="this is a image classification workflow",
#     workspace=wf.resource.Workspace(workspace_id="***"),
# )
# The workspace_id can be looked up in the workspace service of ModelArts.

上述代码示例在云上Notebook环境中可直接调试运行，如果需要在本地IDE中使用，则需要补充相关的session鉴权内容，代码示例修改如下：

import os

from modelarts import workflow as wf
from modelarts.session import Session

# Hard-coding the AK and SK used for authentication, or storing them in plain
# text, is a serious security risk. Keep them encrypted in a configuration file
# or in environment variables and decrypt them at the point of use.
# This example reads the AK and SK from environment variables: set
# HUAWEICLOUD_SDK_AK and HUAWEICLOUD_SDK_SK in the local environment before
# running it. A missing variable raises KeyError here instead of producing a
# session with empty credentials.
__AK = os.environ["HUAWEICLOUD_SDK_AK"]
__SK = os.environ["HUAWEICLOUD_SDK_SK"]
# If the values are stored encrypted, decrypt them here before use.
session = Session(access_key=__AK, secret_key=__SK, project_id='***', region_name='***') # Adjust these values to match your account.

# Unified storage object used to manage the output directory in one place.
output_storage = wf.data.OutputStorage(
    name="output_storage",
    description="输出目录统一配置",
)

# Dataset object (a placeholder named "input_data").
dataset = wf.data.DatasetPlaceholder(
    name="input_data",
)

# Create the training job step.
# Hyperparameters passed as plain literals are fixed in this code; the ones
# wrapped in wf.Placeholder declare a name, a type, a default value and a
# description instead (presumably filled in when the workflow is run --
# confirm against the SDK documentation).
job_step = wf.steps.JobStep(
    name="training_job",
    title="图像分类训练",
    algorithm=wf.AIGalleryAlgorithm(
        subscription_id="***", # Subscription ID of the image classification algorithm; look it up on the algorithm management page.
        item_version_id="10.0.0", # Version of the subscribed algorithm (10.0.0 in this example).
        parameters=[
            wf.AlgorithmParameters(name="task_type", value="image_classification_v2"),
            wf.AlgorithmParameters(name="model_name", value="resnet_v1_50"),
            wf.AlgorithmParameters(name="do_train", value="True"),
            wf.AlgorithmParameters(name="do_eval_along_train", value="True"),
            wf.AlgorithmParameters(name="variable_update", value="horovod"),
            wf.AlgorithmParameters(name="learning_rate_strategy", value=wf.Placeholder(name="learning_rate_strategy", placeholder_type=wf.PlaceholderType.STR, default="0.002", description="训练的学习率策略(10:0.001,20:0.0001代表0-10个epoch学习率0.001，10-20epoch学习率0.0001),如果不指定epoch, 会根据验证精度情况自动调整学习率，并当精度没有明显提升时，训练停止")),
            wf.AlgorithmParameters(name="batch_size", value=wf.Placeholder(name="batch_size", placeholder_type=wf.PlaceholderType.INT, default=64, description="每步训练的图片数量（单卡）")),
            wf.AlgorithmParameters(name="eval_batch_size", value=wf.Placeholder(name="eval_batch_size", placeholder_type=wf.PlaceholderType.INT, default=64, description="每步验证的图片数量（单卡）")),
            wf.AlgorithmParameters(name="evaluate_every_n_epochs", value=wf.Placeholder(name="evaluate_every_n_epochs", placeholder_type=wf.PlaceholderType.FLOAT, default=1.0, description="每训练n个epoch做一次验证")),
            wf.AlgorithmParameters(name="save_model_secs", value=wf.Placeholder(name="save_model_secs", placeholder_type=wf.PlaceholderType.INT, default=60, description="保存模型的频率（单位：s)")),
            wf.AlgorithmParameters(name="save_summary_steps", value=wf.Placeholder(name="save_summary_steps", placeholder_type=wf.PlaceholderType.INT, default=10, description="保存summary的频率（单位：步)")),
            wf.AlgorithmParameters(name="log_every_n_steps", value=wf.Placeholder(name="log_every_n_steps", placeholder_type=wf.PlaceholderType.INT, default=10, description="打印日志的频率（单位：步)")),
            wf.AlgorithmParameters(name="do_data_cleaning", value=wf.Placeholder(name="do_data_cleaning", placeholder_type=wf.PlaceholderType.STR, default="True", description="是否进行数据清洗, 数据格式异常会导致训练失败，建议开启，保证训练稳定性。数据量过大时，数据清洗可能耗时较久，可自行线下清洗（支持BMP.JPEG,PNG格式, RGB三通道）。建议用JPEG格式数据")),
            wf.AlgorithmParameters(name="use_fp16", value=wf.Placeholder(name="use_fp16", placeholder_type=wf.PlaceholderType.STR, default="True", description="是否使用混合精度, 混合精度可以加速训练，但是可能会造成一点精度损失，如果对精度无极严格的要求，建议开启")),
            wf.AlgorithmParameters(name="xla_compile", value=wf.Placeholder(name="xla_compile", placeholder_type=wf.PlaceholderType.STR, default="True", description="是否开启xla编译，加速训练，默认启用")),
            wf.AlgorithmParameters(name="data_format", value=wf.Placeholder(name="data_format", placeholder_type=wf.PlaceholderType.ENUM, default="NCHW", enum_list=["NCHW", "NHWC"], description="输入数据类型，NHWC表示channel在最后，NCHW表channel在最前，默认值NCHW（速度有提升）")),
            wf.AlgorithmParameters(name="best_model", value=wf.Placeholder(name="best_model", placeholder_type=wf.PlaceholderType.STR, default="True", description="是否在训练过程中保存并使用精度最高的模型，而不是最新的模型。默认值True,保存最优模型。在一定误差范围内，最优模型会保存最新的高精度模型")),
            wf.AlgorithmParameters(name="jpeg_preprocess", value=wf.Placeholder(name="jpeg_preprocess", placeholder_type=wf.PlaceholderType.STR, default="True", description="是否使用jpeg预处理加速算子(仅支持jpeg格式数据)，可加速数据读取，提升性能，默认启用。如果数据格式不是jpeg格式，开启数据清洗功能即可使用"))
        ]
    ),
    # The job input "data_url" is bound to the dataset placeholder object.
    inputs=[wf.steps.JobInput(name="data_url", data=dataset)],
    # The job output "train_url" uses an OBS path built by joining
    # "/train_output/" onto the shared output storage object.
    outputs=[wf.steps.JobOutput(name="train_url", obs_config=wf.data.OBSOutputConfig(obs_path=output_storage.join("/train_output/")))],
    spec=wf.steps.JobSpec(
        resource=wf.steps.JobResource(
            # Training resource flavor: a JSON-typed placeholder; no default
            # value is set in this code.
            flavor=wf.Placeholder(
                name="training_flavor",
                placeholder_type=wf.PlaceholderType.JSON,
                description="训练资源规格"
            )
        )
    )
)

# Build the workflow object from the single training step and the shared
# output storage, passing the authenticated session explicitly.
workflow = wf.Workflow(
    session=session,  # authentication object required outside the cloud Notebook
    steps=[job_step],
    storages=[output_storage],
    name="image-classification-ResNeSt",
    desc="this is a image classification workflow",
)

# By default the workflow is created in the "default" workspace. To assign it
# to a specific workspace, pass the workspace argument as shown below:
# workflow = wf.Workflow(
#     session=session,  # authentication object
#     steps=[job_step],
#     storages=[output_storage],
#     name="image-classification-ResNeSt",
#     desc="this is a image classification workflow",
#     workspace=wf.resource.Workspace(workspace_id="***"),
# )
# The workspace_id can be looked up in the workspace service on the ModelArts
# page.

父主题： 开发第一条Workflow

上一篇：AI开发平台MODELARTS-本地IDE连接Notebook:使用本地IDE远程连接Notebook准备环境

下一篇：AI开发平台MODELARTS-发布运行态并执行