mirror of
https://github.com/deepmodeling/Uni-Lab-OS
synced 2026-04-25 17:02:21 +00:00
update workbench example
update aksk desc print res query logs Fix skills exec error with action type Update Skills Update Skills addr Change uni-lab. to leap-lab. Support unit in pylabrobot Support async func. change to leap-lab backend. Support feedback interval. Reduce cocurrent lags. fix create_resource_with_slot update unilabos_formulation & batch-submit-exp scale multi exec thread up to 48 update handle creation api fit cocurrent gap add running status debounce allow non @topic_config support update skill add placeholder keys always free 提交实验技能 disable samples correct sample demo ret value 新增试剂reagent update registry 新增manual_confirm add workstation creation skill add virtual_sample_demo 样品追踪测试设备 add external devices param fix registry upload missing type fast registry load minor fix on skill & registry stripe ros2 schema desc add create-device-skill new registry system backwards to yaml remove not exist resource new registry sys exp. support with add device correct raise create resource error ret info fix revert ret info fix fix prcxi check add create_resource schema re signal host ready event add websocket connection timeout and improve reconnection logic add open_timeout parameter to websocket connection add TimeoutError and InvalidStatus exception handling implement exponential backoff for reconnection attempts simplify reconnection logic flow
This commit is contained in:
@@ -233,7 +233,7 @@ def parse_args():
|
||||
parser.add_argument(
|
||||
"--addr",
|
||||
type=str,
|
||||
default="https://uni-lab.bohrium.com/api/v1",
|
||||
default="https://leap-lab.bohrium.com/api/v1",
|
||||
help="Laboratory backend address",
|
||||
)
|
||||
parser.add_argument(
|
||||
@@ -264,6 +264,12 @@ def parse_args():
|
||||
default=False,
|
||||
help="Test mode: all actions simulate execution and return mock results without running real hardware",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--external_devices_only",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Only load external device packages (--devices), skip built-in unilabos/devices/ scanning and YAML device registry",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--extra_resource",
|
||||
action="store_true",
|
||||
@@ -342,11 +348,18 @@ def main():
|
||||
check_mode = args_dict.get("check_mode", False)
|
||||
|
||||
if not skip_env_check:
|
||||
from unilabos.utils.environment_check import check_environment
|
||||
from unilabos.utils.environment_check import check_environment, check_device_package_requirements
|
||||
|
||||
if not check_environment(auto_install=True):
|
||||
print_status("环境检查失败,程序退出", "error")
|
||||
os._exit(1)
|
||||
|
||||
# 第一次设备包依赖检查:build_registry 之前,确保 import map 可用
|
||||
devices_dirs_for_req = args_dict.get("devices", None)
|
||||
if devices_dirs_for_req:
|
||||
if not check_device_package_requirements(devices_dirs_for_req):
|
||||
print_status("设备包依赖检查失败,程序退出", "error")
|
||||
os._exit(1)
|
||||
else:
|
||||
print_status("跳过环境依赖检查", "warning")
|
||||
|
||||
@@ -425,10 +438,10 @@ def main():
|
||||
if args.addr != parser.get_default("addr"):
|
||||
if args.addr == "test":
|
||||
print_status("使用测试环境地址", "info")
|
||||
HTTPConfig.remote_addr = "https://uni-lab.test.bohrium.com/api/v1"
|
||||
HTTPConfig.remote_addr = "https://leap-lab.test.bohrium.com/api/v1"
|
||||
elif args.addr == "uat":
|
||||
print_status("使用uat环境地址", "info")
|
||||
HTTPConfig.remote_addr = "https://uni-lab.uat.bohrium.com/api/v1"
|
||||
HTTPConfig.remote_addr = "https://leap-lab.uat.bohrium.com/api/v1"
|
||||
elif args.addr == "local":
|
||||
print_status("使用本地环境地址", "info")
|
||||
HTTPConfig.remote_addr = "http://127.0.0.1:48197/api/v1"
|
||||
@@ -477,19 +490,7 @@ def main():
|
||||
BasicConfig.vis_2d_enable = args_dict["2d_vis"]
|
||||
BasicConfig.check_mode = check_mode
|
||||
|
||||
from unilabos.resources.graphio import (
|
||||
read_node_link_json,
|
||||
read_graphml,
|
||||
dict_from_graph,
|
||||
)
|
||||
from unilabos.app.communication import get_communication_client
|
||||
from unilabos.registry.registry import build_registry
|
||||
from unilabos.app.backend import start_backend
|
||||
from unilabos.app.web import http_client
|
||||
from unilabos.app.web import start_server
|
||||
from unilabos.app.register import register_devices_and_resources
|
||||
from unilabos.resources.graphio import modify_to_backend_format
|
||||
from unilabos.resources.resource_tracker import ResourceTreeSet, ResourceDict
|
||||
|
||||
# 显示启动横幅
|
||||
print_unilab_banner(args_dict)
|
||||
@@ -498,12 +499,14 @@ def main():
|
||||
# check_mode 和 upload_registry 都会执行实际 import 验证
|
||||
devices_dirs = args_dict.get("devices", None)
|
||||
complete_registry = args_dict.get("complete_registry", False) or check_mode
|
||||
external_only = args_dict.get("external_devices_only", False)
|
||||
lab_registry = build_registry(
|
||||
registry_paths=args_dict["registry_path"],
|
||||
devices_dirs=devices_dirs,
|
||||
upload_registry=BasicConfig.upload_registry,
|
||||
check_mode=check_mode,
|
||||
complete_registry=complete_registry,
|
||||
external_only=external_only,
|
||||
)
|
||||
|
||||
# Check mode: 注册表验证完成后直接退出
|
||||
@@ -513,6 +516,20 @@ def main():
|
||||
print_status(f"Check mode: 注册表验证完成 ({device_count} 设备, {resource_count} 资源),退出", "info")
|
||||
os._exit(0)
|
||||
|
||||
# 以下导入依赖 ROS2 环境,check_mode 已退出不需要
|
||||
from unilabos.resources.graphio import (
|
||||
read_node_link_json,
|
||||
read_graphml,
|
||||
dict_from_graph,
|
||||
modify_to_backend_format,
|
||||
)
|
||||
from unilabos.app.communication import get_communication_client
|
||||
from unilabos.app.backend import start_backend
|
||||
from unilabos.app.web import http_client
|
||||
from unilabos.app.web import start_server
|
||||
from unilabos.app.register import register_devices_and_resources
|
||||
from unilabos.resources.resource_tracker import ResourceTreeSet, ResourceDict
|
||||
|
||||
# Step 1: 上传全部注册表到服务端,同步保存到 unilabos_data
|
||||
if BasicConfig.upload_registry:
|
||||
if BasicConfig.ak and BasicConfig.sk:
|
||||
@@ -536,7 +553,7 @@ def main():
|
||||
os._exit(0)
|
||||
|
||||
if not BasicConfig.ak or not BasicConfig.sk:
|
||||
print_status("后续运行必须拥有一个实验室,请前往 https://uni-lab.bohrium.com 注册实验室!", "warning")
|
||||
print_status("后续运行必须拥有一个实验室,请前往 https://leap-lab.bohrium.com 注册实验室!", "warning")
|
||||
os._exit(1)
|
||||
graph: nx.Graph
|
||||
resource_tree_set: ResourceTreeSet
|
||||
@@ -610,6 +627,10 @@ def main():
|
||||
resource_tree_set.merge_remote_resources(remote_tree_set)
|
||||
print_status("远端物料同步完成", "info")
|
||||
|
||||
# 第二次设备包依赖检查:云端物料同步后,community 包可能引入新的 requirements
|
||||
# TODO: 当 community device package 功能上线后,在这里调用
|
||||
# install_requirements_txt(community_pkg_path / "requirements.txt", label="community.xxx")
|
||||
|
||||
# 使用 ResourceTreeSet 代替 list
|
||||
args_dict["resources_config"] = resource_tree_set
|
||||
args_dict["devices_config"] = resource_tree_set
|
||||
|
||||
@@ -36,6 +36,9 @@ class HTTPClient:
|
||||
auth_secret = BasicConfig.auth_secret()
|
||||
self.auth = auth_secret
|
||||
info(f"正在使用ak sk作为授权信息:[{auth_secret}]")
|
||||
# 复用 TCP/TLS 连接,避免每次请求重新握手
|
||||
self._session = requests.Session()
|
||||
self._session.headers.update({"Authorization": f"Lab {self.auth}"})
|
||||
info(f"HTTPClient 初始化完成: remote_addr={self.remote_addr}")
|
||||
|
||||
def resource_edge_add(self, resources: List[Dict[str, Any]]) -> requests.Response:
|
||||
@@ -48,7 +51,7 @@ class HTTPClient:
|
||||
Returns:
|
||||
Response: API响应对象
|
||||
"""
|
||||
response = requests.post(
|
||||
response = self._session.post(
|
||||
f"{self.remote_addr}/edge/material/edge",
|
||||
json={
|
||||
"edges": resources,
|
||||
@@ -75,25 +78,28 @@ class HTTPClient:
|
||||
Returns:
|
||||
Dict[str, str]: 旧UUID到新UUID的映射关系 {old_uuid: new_uuid}
|
||||
"""
|
||||
with open(os.path.join(BasicConfig.working_dir, "req_resource_tree_add.json"), "w", encoding="utf-8") as f:
|
||||
payload = {"nodes": [x for xs in resources.dump() for x in xs], "mount_uuid": mount_uuid}
|
||||
f.write(json.dumps(payload, indent=4))
|
||||
# 从序列化数据中提取所有节点的UUID(保存旧UUID)
|
||||
# dump() 只调用一次,复用给文件保存和 HTTP 请求
|
||||
nodes_info = [x for xs in resources.dump() for x in xs]
|
||||
old_uuids = {n.res_content.uuid: n for n in resources.all_nodes}
|
||||
payload = {"nodes": nodes_info, "mount_uuid": mount_uuid}
|
||||
body_bytes = _fast_dumps(payload)
|
||||
with open(os.path.join(BasicConfig.working_dir, "req_resource_tree_add.json"), "wb") as f:
|
||||
f.write(_fast_dumps_pretty(payload))
|
||||
http_headers = {"Content-Type": "application/json"}
|
||||
if not self.initialized or first_add:
|
||||
self.initialized = True
|
||||
info(f"首次添加资源,当前远程地址: {self.remote_addr}")
|
||||
response = requests.post(
|
||||
response = self._session.post(
|
||||
f"{self.remote_addr}/edge/material",
|
||||
json={"nodes": [x for xs in resources.dump() for x in xs], "mount_uuid": mount_uuid},
|
||||
headers={"Authorization": f"Lab {self.auth}"},
|
||||
data=body_bytes,
|
||||
headers=http_headers,
|
||||
timeout=60,
|
||||
)
|
||||
else:
|
||||
response = requests.put(
|
||||
response = self._session.put(
|
||||
f"{self.remote_addr}/edge/material",
|
||||
json={"nodes": [x for xs in resources.dump() for x in xs], "mount_uuid": mount_uuid},
|
||||
headers={"Authorization": f"Lab {self.auth}"},
|
||||
data=body_bytes,
|
||||
headers=http_headers,
|
||||
timeout=10,
|
||||
)
|
||||
|
||||
@@ -111,6 +117,7 @@ class HTTPClient:
|
||||
uuid_mapping[i["uuid"]] = i["cloud_uuid"]
|
||||
else:
|
||||
logger.error(f"添加物料失败: {response.text}")
|
||||
logger.trace(f"添加物料失败: {nodes_info}")
|
||||
for u, n in old_uuids.items():
|
||||
if u in uuid_mapping:
|
||||
n.res_content.uuid = uuid_mapping[u]
|
||||
@@ -131,7 +138,7 @@ class HTTPClient:
|
||||
"""
|
||||
with open(os.path.join(BasicConfig.working_dir, "req_resource_tree_get.json"), "w", encoding="utf-8") as f:
|
||||
f.write(json.dumps({"uuids": uuid_list, "with_children": with_children}, indent=4))
|
||||
response = requests.post(
|
||||
response = self._session.post(
|
||||
f"{self.remote_addr}/edge/material/query",
|
||||
json={"uuids": uuid_list, "with_children": with_children},
|
||||
headers={"Authorization": f"Lab {self.auth}"},
|
||||
@@ -145,6 +152,7 @@ class HTTPClient:
|
||||
logger.error(f"查询物料失败: {response.text}")
|
||||
else:
|
||||
data = res["data"]["nodes"]
|
||||
logger.trace(f"resource_tree_get查询到物料: {data}")
|
||||
return data
|
||||
else:
|
||||
logger.error(f"查询物料失败: {response.text}")
|
||||
@@ -162,14 +170,14 @@ class HTTPClient:
|
||||
if not self.initialized:
|
||||
self.initialized = True
|
||||
info(f"首次添加资源,当前远程地址: {self.remote_addr}")
|
||||
response = requests.post(
|
||||
response = self._session.post(
|
||||
f"{self.remote_addr}/lab/material",
|
||||
json={"nodes": resources},
|
||||
headers={"Authorization": f"Lab {self.auth}"},
|
||||
timeout=100,
|
||||
)
|
||||
else:
|
||||
response = requests.put(
|
||||
response = self._session.put(
|
||||
f"{self.remote_addr}/lab/material",
|
||||
json={"nodes": resources},
|
||||
headers={"Authorization": f"Lab {self.auth}"},
|
||||
@@ -196,7 +204,7 @@ class HTTPClient:
|
||||
"""
|
||||
with open(os.path.join(BasicConfig.working_dir, "req_resource_get.json"), "w", encoding="utf-8") as f:
|
||||
f.write(json.dumps({"id": id, "with_children": with_children}, indent=4))
|
||||
response = requests.get(
|
||||
response = self._session.get(
|
||||
f"{self.remote_addr}/lab/material",
|
||||
params={"id": id, "with_children": with_children},
|
||||
headers={"Authorization": f"Lab {self.auth}"},
|
||||
@@ -237,14 +245,14 @@ class HTTPClient:
|
||||
if not self.initialized:
|
||||
self.initialized = True
|
||||
info(f"首次添加资源,当前远程地址: {self.remote_addr}")
|
||||
response = requests.post(
|
||||
response = self._session.post(
|
||||
f"{self.remote_addr}/lab/material",
|
||||
json={"nodes": resources},
|
||||
headers={"Authorization": f"Lab {self.auth}"},
|
||||
timeout=100,
|
||||
)
|
||||
else:
|
||||
response = requests.put(
|
||||
response = self._session.put(
|
||||
f"{self.remote_addr}/lab/material",
|
||||
json={"nodes": resources},
|
||||
headers={"Authorization": f"Lab {self.auth}"},
|
||||
@@ -274,7 +282,7 @@ class HTTPClient:
|
||||
with open(file_path, "rb") as file:
|
||||
files = {"files": file}
|
||||
logger.info(f"上传文件: {file_path} 到 {scene}")
|
||||
response = requests.post(
|
||||
response = self._session.post(
|
||||
f"{self.remote_addr}/api/account/file_upload/{scene}",
|
||||
files=files,
|
||||
headers={"Authorization": f"Lab {self.auth}"},
|
||||
@@ -314,7 +322,7 @@ class HTTPClient:
|
||||
"Content-Type": "application/json",
|
||||
"Content-Encoding": "gzip",
|
||||
}
|
||||
response = requests.post(
|
||||
response = self._session.post(
|
||||
f"{self.remote_addr}/lab/resource",
|
||||
data=compressed_body,
|
||||
headers=headers,
|
||||
@@ -348,7 +356,7 @@ class HTTPClient:
|
||||
Returns:
|
||||
Response: API响应对象
|
||||
"""
|
||||
response = requests.get(
|
||||
response = self._session.get(
|
||||
f"{self.remote_addr}/edge/material/download",
|
||||
headers={"Authorization": f"Lab {self.auth}"},
|
||||
timeout=(3, 30),
|
||||
@@ -409,7 +417,7 @@ class HTTPClient:
|
||||
with open(os.path.join(BasicConfig.working_dir, "req_workflow_upload.json"), "w", encoding="utf-8") as f:
|
||||
f.write(json.dumps(payload, indent=4, ensure_ascii=False))
|
||||
|
||||
response = requests.post(
|
||||
response = self._session.post(
|
||||
f"{self.remote_addr}/lab/workflow/owner/import",
|
||||
json=payload,
|
||||
headers={"Authorization": f"Lab {self.auth}"},
|
||||
|
||||
@@ -754,6 +754,32 @@ class MessageProcessor:
|
||||
req = JobAddReq(**data)
|
||||
|
||||
job_log = format_job_log(req.job_id, req.task_id, req.device_id, req.action)
|
||||
|
||||
# 服务端对always_free动作可能跳过query_action_state直接发job_start,
|
||||
# 此时job尚未注册,需要自动补注册
|
||||
existing_job = self.device_manager.get_job_info(req.job_id)
|
||||
if not existing_job:
|
||||
action_name = req.action
|
||||
device_action_key = f"/devices/{req.device_id}/{action_name}"
|
||||
action_always_free = self._check_action_always_free(req.device_id, action_name)
|
||||
|
||||
if action_always_free:
|
||||
job_info = JobInfo(
|
||||
job_id=req.job_id,
|
||||
task_id=req.task_id,
|
||||
device_id=req.device_id,
|
||||
action_name=action_name,
|
||||
device_action_key=device_action_key,
|
||||
status=JobStatus.QUEUE,
|
||||
start_time=time.time(),
|
||||
always_free=True,
|
||||
)
|
||||
self.device_manager.add_queue_request(job_info)
|
||||
logger.info(f"[MessageProcessor] Job {job_log} always_free, auto-registered from direct job_start")
|
||||
else:
|
||||
logger.error(f"[MessageProcessor] Job {job_log} not registered (missing query_action_state)")
|
||||
return
|
||||
|
||||
success = self.device_manager.start_job(req.job_id)
|
||||
if not success:
|
||||
logger.error(f"[MessageProcessor] Failed to start job {job_log}")
|
||||
@@ -1087,7 +1113,7 @@ class MessageProcessor:
|
||||
"task_id": task_id,
|
||||
"job_id": job_id,
|
||||
"free": free,
|
||||
"need_more": need_more,
|
||||
"need_more": need_more + 1,
|
||||
},
|
||||
}
|
||||
|
||||
@@ -1227,7 +1253,7 @@ class QueueProcessor:
|
||||
"task_id": job_info.task_id,
|
||||
"job_id": job_info.job_id,
|
||||
"free": False,
|
||||
"need_more": 10,
|
||||
"need_more": 10 + 1,
|
||||
},
|
||||
}
|
||||
self.message_processor.send_message(message)
|
||||
@@ -1243,7 +1269,13 @@ class QueueProcessor:
|
||||
if not queued_jobs:
|
||||
return
|
||||
|
||||
logger.debug(f"[QueueProcessor] Sending busy status for {len(queued_jobs)} queued jobs")
|
||||
queue_summary = {}
|
||||
for j in queued_jobs:
|
||||
key = f"{j.device_id}/{j.action_name}"
|
||||
queue_summary[key] = queue_summary.get(key, 0) + 1
|
||||
logger.debug(
|
||||
f"[QueueProcessor] Sending busy status for {len(queued_jobs)} queued jobs: {queue_summary}"
|
||||
)
|
||||
|
||||
for job_info in queued_jobs:
|
||||
# 快照可能已过期:在遍历过程中 end_job() 可能已将此 job 移至 READY,
|
||||
@@ -1260,7 +1292,7 @@ class QueueProcessor:
|
||||
"task_id": job_info.task_id,
|
||||
"job_id": job_info.job_id,
|
||||
"free": False,
|
||||
"need_more": 10,
|
||||
"need_more": 10 + 1,
|
||||
},
|
||||
}
|
||||
success = self.message_processor.send_message(message)
|
||||
@@ -1343,6 +1375,10 @@ class WebSocketClient(BaseCommunicationClient):
|
||||
self.message_processor = MessageProcessor(self.websocket_url, self.send_queue, self.device_manager)
|
||||
self.queue_processor = QueueProcessor(self.device_manager, self.message_processor)
|
||||
|
||||
# running状态debounce缓存: {job_id: (last_send_timestamp, last_feedback_data)}
|
||||
self._job_running_last_sent: Dict[str, tuple] = {}
|
||||
self._job_running_debounce_interval: float = 10.0 # 秒
|
||||
|
||||
# 设置相互引用
|
||||
self.message_processor.set_queue_processor(self.queue_processor)
|
||||
self.message_processor.set_websocket_client(self)
|
||||
@@ -1442,22 +1478,32 @@ class WebSocketClient(BaseCommunicationClient):
|
||||
logger.debug(f"[WebSocketClient] Not connected, cannot publish job status for job_id: {item.job_id}")
|
||||
return
|
||||
|
||||
job_log = format_job_log(item.job_id, item.task_id, item.device_id, item.action_name)
|
||||
|
||||
# 拦截最终结果状态,与原版本逻辑一致
|
||||
if status in ["success", "failed"]:
|
||||
self._job_running_last_sent.pop(item.job_id, None)
|
||||
|
||||
host_node = HostNode.get_instance(0)
|
||||
if host_node:
|
||||
# 从HostNode的device_action_status中移除job_id
|
||||
try:
|
||||
host_node._device_action_status[item.device_action_key].job_ids.pop(item.job_id, None)
|
||||
except (KeyError, AttributeError):
|
||||
logger.warning(f"[WebSocketClient] Failed to remove job {item.job_id} from HostNode status")
|
||||
|
||||
# logger.debug(f"[WebSocketClient] Intercepting final status for job_id: {item.job_id} - {status}")
|
||||
|
||||
# 通知队列处理器job完成(包括timeout的job)
|
||||
self.queue_processor.handle_job_completed(item.job_id, status)
|
||||
|
||||
# 发送job状态消息
|
||||
# running状态按job_id做debounce,内容变化时仍然上报
|
||||
if status == "running":
|
||||
now = time.time()
|
||||
cached = self._job_running_last_sent.get(item.job_id)
|
||||
if cached is not None:
|
||||
last_ts, last_data = cached
|
||||
if now - last_ts < self._job_running_debounce_interval and last_data == feedback_data:
|
||||
logger.trace(f"[WebSocketClient] Job status debounced (skip): {job_log} - {status}")
|
||||
return
|
||||
self._job_running_last_sent[item.job_id] = (now, feedback_data)
|
||||
|
||||
message = {
|
||||
"action": "job_status",
|
||||
"data": {
|
||||
@@ -1473,7 +1519,6 @@ class WebSocketClient(BaseCommunicationClient):
|
||||
}
|
||||
self.message_processor.send_message(message)
|
||||
|
||||
job_log = format_job_log(item.job_id, item.task_id, item.device_id, item.action_name)
|
||||
logger.trace(f"[WebSocketClient] Job status published: {job_log} - {status}")
|
||||
|
||||
def send_ping(self, ping_id: str, timestamp: float) -> None:
|
||||
|
||||
Reference in New Issue
Block a user