附录:config.json文件 config.json文件用于推理服务启动时,需要修改以下参数,2台机器的每个容器中config.json文件内容一致。 ipAddress:主节点IP地址,即rank_table_file.json文件中的server_id。 managementIpAddress:主节点IP地址,和ipAddress取值一致。 httpsEnabled:取值需要修改为false。 interCommTLSEnabled和interNodeTLSEnabled:如果不需要开启安全认证,这2个参数取值需要修改为false。 multiNodesInferEnabled:取值需要修改为true,表示开启多机推理。 modelName:设置为DeepSeek-V3或DeepSeek-R1。 modelWeightPath:权重文件在容器内的地址,例如:${container_work_dir}/deepseekV3-w8a8或${container_work_dir}/deepseekR1-w8a8目录。${container_work_dir}在步骤四:启动容器时定义。 maxPrefillBatchSize:最大prefill batch size。config.json文件中默认是50,并发请求数量超出设置,推理请求会被拒绝。用户可以根据实际修改。maxPrefillBatchSize和maxPrefillTokens谁先达到各自的取值就完成本次组batch。 maxSeqLen:输入长度+输出长度的最大值。该值为maxInputTokenLen+maxIterTimes的和。config.json文件中默认是16k,用户可以根据自己的推理场景设置。 maxInputTokenLen:输入最大长度。config.json文件中默认是15k,用户可以根据自己的推理场景设置。 maxPrefillTokens:最大prefill token数。和maxInputTokenLen保持相同。 maxIterTimes:最大输出长度。config.json文件中默认是1k,用户可以根据自己的推理场景设置。 当前在W8A8量化权重、2台Ascend Snt9B资源下支持的maxSeqLen最大为32768。 {
"Version" : "1.0.0",
"LogConfig" :
{
"logLevel" : "Info",
"logFileSize" : 20,
"logFileNum" : 20,
"logPath" : "logs/mindie-server.log"
},
"ServerConfig" :
{
"ipAddress" : "7.242.110.112",
"managementIpAddress" : "7.242.110.112",
"port" : 1025,
"managementPort" : 1026,
"metricsPort" : 1027,
"allowAllZeroIpListening" : false,
"maxLinkNum" : 1000,
"httpsEnabled" : false,
"fullTextEnabled" : false,
"tlsCaPath" : "security/ca/",
"tlsCaFile" : ["ca.pem"],
"tlsCert" : "security/certs/server.pem",
"tlsPk" : "security/keys/server.key.pem",
"tlsPkPwd" : "security/pass/key_pwd.txt",
"tlsCrlPath" : "security/certs/",
"tlsCrlFiles" : ["server_crl.pem"],
"managementTlsCaFile" : ["management_ca.pem"],
"managementTlsCert" : "security/certs/management/server.pem",
"managementTlsPk" : "security/keys/management/server.key.pem",
"managementTlsPkPwd" : "security/pass/management/key_pwd.txt",
"managementTlsCrlPath" : "security/management/certs/",
"managementTlsCrlFiles" : ["server_crl.pem"],
"kmcKsfMaster" : "tools/pmt/master/ksfa",
"kmcKsfStandby" : "tools/pmt/standby/ksfb",
"inferMode" : "standard",
"interCommTLSEnabled" : false,
"interCommPort" : 1121,
"interCommTlsCaPath" : "security/grpc/ca/",
"interCommTlsCaFiles" : ["ca.pem"],
"interCommTlsCert" : "security/grpc/certs/server.pem",
"interCommPk" : "security/grpc/keys/server.key.pem",
"interCommPkPwd" : "security/grpc/pass/key_pwd.txt",
"interCommTlsCrlPath" : "security/grpc/certs/",
"interCommTlsCrlFiles" : ["server_crl.pem"],
"openAiSupport" : "vllm"
},
"BackendConfig" : {
"backendName" : "mindieservice_llm_engine",
"modelInstanceNumber" : 1,
"npuDeviceIds" : [[0,1,2,3]],
"tokenizerProcessNumber" : 8,
"multiNodesInferEnabled" : true,
"multiNodesInferPort" : 1120,
"interNodeTLSEnabled" : false,
"interNodeTlsCaPath" : "security/grpc/ca/",
"interNodeTlsCaFiles" : ["ca.pem"],
"interNodeTlsCert" : "security/grpc/certs/server.pem",
"interNodeTlsPk" : "security/grpc/keys/server.key.pem",
"interNodeTlsPkPwd" : "security/grpc/pass/mindie_server_key_pwd.txt",
"interNodeTlsCrlPath" : "security/grpc/certs/",
"interNodeTlsCrlFiles" : ["server_crl.pem"],
"interNodeKmcKsfMaster" : "tools/pmt/master/ksfa",
"interNodeKmcKsfStandby" : "tools/pmt/standby/ksfb",
"ModelDeployConfig" :
{
"maxSeqLen" : 16384,
"maxInputTokenLen" : 15360,
"truncation" : false,
"ModelConfig" : [
{
"modelInstanceType" : "Standard",
"modelName" : "DeepSeek-V3",
"modelWeightPath" : "/data/model/DeepSeek-V3-w8a8",
"worldSize" : 4,
"cpuMemSize" : 5,
"npuMemSize" : -1,
"backendType" : "atb",
"trustRemoteCode" : false
}
]
},
"ScheduleConfig" :
{
"templateType" : "Standard",
"templateName" : "Standard_LLM",
"cacheBlockSize" : 128,
"maxPrefillBatchSize" : 50,
"maxPrefillTokens" : 15360,
"prefillTimeMsPerReq" : 150,
"prefillPolicyType" : 0,
"decodeTimeMsPerReq" : 50,
"decodePolicyType" : 0,
"maxBatchSize" : 200,
"maxIterTimes" : 1024,
"maxPreemptCount" : 0,
"supportSelectBatch" : false,
"maxQueueDelayMicroseconds" : 5000
}
}
}
父主题: DeepSeek模型基于ModelArts Lite Server适配MindIE推理部署指导
附录:rank_table_file.json文件 rank_table_file.json文件样例如下,需要根据实际修改server_count,device_ip,server_id,container_ip参数,每台机器上的rank_table_file.json文件内容一致。在步骤三:创建rank_table_file.json步骤中会用到。 server_count:节点个数。当前默认为2。 device_ip:当前卡的IP地址,2台机器共16张卡。device_ip查询命令 for i in {0..7};do hccn_tool -i $i -ip -g; done server_id:当前Server节点的IP地址,涉及2台机器。 container_ip:容器IP地址,无特殊配置时与server_id保持一致。 {
"server_count": "2",
"server_list": [
{
"device": [
{
"device_id": "0",
"device_ip": "29.82.85.12",
"rank_id": "0"
},
{
"device_id": "1",
"device_ip": "29.82.98.67",
"rank_id": "1"
},
{
"device_id": "2",
"device_ip": "29.82.133.21",
"rank_id": "2"
},
{
"device_id": "3",
"device_ip": "29.82.175.69",
"rank_id": "3"
},
{
"device_id": "4",
"device_ip": "29.82.13.154",
"rank_id": "4"
},
{
"device_id": "5",
"device_ip": "29.82.140.51",
"rank_id": "5"
},
{
"device_id": "6",
"device_ip": "29.82.157.87",
"rank_id": "6"
},
{
"device_id": "7",
"device_ip": "29.82.15.225",
"rank_id": "7"
}
],
"server_id": "7.242.110.112",
"container_ip": "7.242.110.112"
},
{
"device": [
{
"device_id": "0",
"device_ip": "29.82.177.28",
"rank_id": "8"
},
{
"device_id": "1",
"device_ip": "29.82.41.231",
"rank_id": "9"
},
{
"device_id": "2",
"device_ip": "29.82.16.3",
"rank_id": "10"
},
{
"device_id": "3",
"device_ip": "29.82.154.20",
"rank_id": "11"
},
{
"device_id": "4",
"device_ip": "29.82.56.73",
"rank_id": "12"
},
{
"device_id": "5",
"device_ip": "29.82.177.138",
"rank_id": "13"
},
{
"device_id": "6",
"device_ip": "29.82.29.230",
"rank_id": "14"
},
{
"device_id": "7",
"device_ip": "29.82.1.176",
"rank_id": "15"
}
],
"server_id": "7.242.104.54",
"container_ip": "7.242.104.54"
}
],
"status": "completed",
"version": "1.0"
} 父主题: DeepSeek模型基于ModelArts Lite Server适配MindIE推理部署指导