update workbench example

update aksk desc

print res query logs

Fix skills exec error with action type

Update Skills

Update Skills addr

Change uni-lab. to leap-lab.
Support unit in pylabrobot

Support async func.

change to leap-lab backend. Support feedback interval. Reduce cocurrent lags.

fix create_resource_with_slot

update unilabos_formulation & batch-submit-exp

scale multi exec thread up to 48

update handle creation api

fit cocurrent gap

add running status debounce

allow non @topic_config support

update skill

add placeholder keys

always free

提交实验技能

disable samples

correct sample demo ret value

新增试剂reagent

update registry

新增manual_confirm

add workstation creation skill

add virtual_sample_demo 样品追踪测试设备

add external devices param
fix registry upload missing type

fast registry load

minor fix on skill & registry

stripe ros2 schema desc
add create-device-skill

new registry system backwards to yaml

remove not exist resource

new registry sys
exp. support with add device

correct raise create resource error

ret info fix revert

ret info fix

fix prcxi check

add create_resource schema

re signal host ready event

add websocket connection timeout and improve reconnection logic

add open_timeout parameter to websocket connection
add TimeoutError and InvalidStatus exception handling
implement exponential backoff for reconnection attempts
simplify reconnection logic flow
This commit is contained in:
Xuwznln
2026-03-07 04:40:34 +08:00
parent c001f6a151
commit ccbf5378dd
58 changed files with 6818 additions and 489 deletions

View File

@@ -233,7 +233,7 @@ def parse_args():
parser.add_argument(
"--addr",
type=str,
default="https://uni-lab.bohrium.com/api/v1",
default="https://leap-lab.bohrium.com/api/v1",
help="Laboratory backend address",
)
parser.add_argument(
@@ -264,6 +264,12 @@ def parse_args():
default=False,
help="Test mode: all actions simulate execution and return mock results without running real hardware",
)
parser.add_argument(
"--external_devices_only",
action="store_true",
default=False,
help="Only load external device packages (--devices), skip built-in unilabos/devices/ scanning and YAML device registry",
)
parser.add_argument(
"--extra_resource",
action="store_true",
@@ -342,11 +348,18 @@ def main():
check_mode = args_dict.get("check_mode", False)
if not skip_env_check:
from unilabos.utils.environment_check import check_environment
from unilabos.utils.environment_check import check_environment, check_device_package_requirements
if not check_environment(auto_install=True):
print_status("环境检查失败,程序退出", "error")
os._exit(1)
# 第一次设备包依赖检查build_registry 之前,确保 import map 可用
devices_dirs_for_req = args_dict.get("devices", None)
if devices_dirs_for_req:
if not check_device_package_requirements(devices_dirs_for_req):
print_status("设备包依赖检查失败,程序退出", "error")
os._exit(1)
else:
print_status("跳过环境依赖检查", "warning")
@@ -425,10 +438,10 @@ def main():
if args.addr != parser.get_default("addr"):
if args.addr == "test":
print_status("使用测试环境地址", "info")
HTTPConfig.remote_addr = "https://uni-lab.test.bohrium.com/api/v1"
HTTPConfig.remote_addr = "https://leap-lab.test.bohrium.com/api/v1"
elif args.addr == "uat":
print_status("使用uat环境地址", "info")
HTTPConfig.remote_addr = "https://uni-lab.uat.bohrium.com/api/v1"
HTTPConfig.remote_addr = "https://leap-lab.uat.bohrium.com/api/v1"
elif args.addr == "local":
print_status("使用本地环境地址", "info")
HTTPConfig.remote_addr = "http://127.0.0.1:48197/api/v1"
@@ -477,19 +490,7 @@ def main():
BasicConfig.vis_2d_enable = args_dict["2d_vis"]
BasicConfig.check_mode = check_mode
from unilabos.resources.graphio import (
read_node_link_json,
read_graphml,
dict_from_graph,
)
from unilabos.app.communication import get_communication_client
from unilabos.registry.registry import build_registry
from unilabos.app.backend import start_backend
from unilabos.app.web import http_client
from unilabos.app.web import start_server
from unilabos.app.register import register_devices_and_resources
from unilabos.resources.graphio import modify_to_backend_format
from unilabos.resources.resource_tracker import ResourceTreeSet, ResourceDict
# 显示启动横幅
print_unilab_banner(args_dict)
@@ -498,12 +499,14 @@ def main():
# check_mode 和 upload_registry 都会执行实际 import 验证
devices_dirs = args_dict.get("devices", None)
complete_registry = args_dict.get("complete_registry", False) or check_mode
external_only = args_dict.get("external_devices_only", False)
lab_registry = build_registry(
registry_paths=args_dict["registry_path"],
devices_dirs=devices_dirs,
upload_registry=BasicConfig.upload_registry,
check_mode=check_mode,
complete_registry=complete_registry,
external_only=external_only,
)
# Check mode: 注册表验证完成后直接退出
@@ -513,6 +516,20 @@ def main():
print_status(f"Check mode: 注册表验证完成 ({device_count} 设备, {resource_count} 资源),退出", "info")
os._exit(0)
# 以下导入依赖 ROS2 环境check_mode 已退出不需要
from unilabos.resources.graphio import (
read_node_link_json,
read_graphml,
dict_from_graph,
modify_to_backend_format,
)
from unilabos.app.communication import get_communication_client
from unilabos.app.backend import start_backend
from unilabos.app.web import http_client
from unilabos.app.web import start_server
from unilabos.app.register import register_devices_and_resources
from unilabos.resources.resource_tracker import ResourceTreeSet, ResourceDict
# Step 1: 上传全部注册表到服务端,同步保存到 unilabos_data
if BasicConfig.upload_registry:
if BasicConfig.ak and BasicConfig.sk:
@@ -536,7 +553,7 @@ def main():
os._exit(0)
if not BasicConfig.ak or not BasicConfig.sk:
print_status("后续运行必须拥有一个实验室,请前往 https://uni-lab.bohrium.com 注册实验室!", "warning")
print_status("后续运行必须拥有一个实验室,请前往 https://leap-lab.bohrium.com 注册实验室!", "warning")
os._exit(1)
graph: nx.Graph
resource_tree_set: ResourceTreeSet
@@ -610,6 +627,10 @@ def main():
resource_tree_set.merge_remote_resources(remote_tree_set)
print_status("远端物料同步完成", "info")
# 第二次设备包依赖检查云端物料同步后community 包可能引入新的 requirements
# TODO: 当 community device package 功能上线后,在这里调用
# install_requirements_txt(community_pkg_path / "requirements.txt", label="community.xxx")
# 使用 ResourceTreeSet 代替 list
args_dict["resources_config"] = resource_tree_set
args_dict["devices_config"] = resource_tree_set

View File

@@ -36,6 +36,9 @@ class HTTPClient:
auth_secret = BasicConfig.auth_secret()
self.auth = auth_secret
info(f"正在使用ak sk作为授权信息[{auth_secret}]")
# 复用 TCP/TLS 连接,避免每次请求重新握手
self._session = requests.Session()
self._session.headers.update({"Authorization": f"Lab {self.auth}"})
info(f"HTTPClient 初始化完成: remote_addr={self.remote_addr}")
def resource_edge_add(self, resources: List[Dict[str, Any]]) -> requests.Response:
@@ -48,7 +51,7 @@ class HTTPClient:
Returns:
Response: API响应对象
"""
response = requests.post(
response = self._session.post(
f"{self.remote_addr}/edge/material/edge",
json={
"edges": resources,
@@ -75,25 +78,28 @@ class HTTPClient:
Returns:
Dict[str, str]: 旧UUID到新UUID的映射关系 {old_uuid: new_uuid}
"""
with open(os.path.join(BasicConfig.working_dir, "req_resource_tree_add.json"), "w", encoding="utf-8") as f:
payload = {"nodes": [x for xs in resources.dump() for x in xs], "mount_uuid": mount_uuid}
f.write(json.dumps(payload, indent=4))
# 从序列化数据中提取所有节点的UUID保存旧UUID
# dump() 只调用一次,复用给文件保存和 HTTP 请求
nodes_info = [x for xs in resources.dump() for x in xs]
old_uuids = {n.res_content.uuid: n for n in resources.all_nodes}
payload = {"nodes": nodes_info, "mount_uuid": mount_uuid}
body_bytes = _fast_dumps(payload)
with open(os.path.join(BasicConfig.working_dir, "req_resource_tree_add.json"), "wb") as f:
f.write(_fast_dumps_pretty(payload))
http_headers = {"Content-Type": "application/json"}
if not self.initialized or first_add:
self.initialized = True
info(f"首次添加资源,当前远程地址: {self.remote_addr}")
response = requests.post(
response = self._session.post(
f"{self.remote_addr}/edge/material",
json={"nodes": [x for xs in resources.dump() for x in xs], "mount_uuid": mount_uuid},
headers={"Authorization": f"Lab {self.auth}"},
data=body_bytes,
headers=http_headers,
timeout=60,
)
else:
response = requests.put(
response = self._session.put(
f"{self.remote_addr}/edge/material",
json={"nodes": [x for xs in resources.dump() for x in xs], "mount_uuid": mount_uuid},
headers={"Authorization": f"Lab {self.auth}"},
data=body_bytes,
headers=http_headers,
timeout=10,
)
@@ -111,6 +117,7 @@ class HTTPClient:
uuid_mapping[i["uuid"]] = i["cloud_uuid"]
else:
logger.error(f"添加物料失败: {response.text}")
logger.trace(f"添加物料失败: {nodes_info}")
for u, n in old_uuids.items():
if u in uuid_mapping:
n.res_content.uuid = uuid_mapping[u]
@@ -131,7 +138,7 @@ class HTTPClient:
"""
with open(os.path.join(BasicConfig.working_dir, "req_resource_tree_get.json"), "w", encoding="utf-8") as f:
f.write(json.dumps({"uuids": uuid_list, "with_children": with_children}, indent=4))
response = requests.post(
response = self._session.post(
f"{self.remote_addr}/edge/material/query",
json={"uuids": uuid_list, "with_children": with_children},
headers={"Authorization": f"Lab {self.auth}"},
@@ -145,6 +152,7 @@ class HTTPClient:
logger.error(f"查询物料失败: {response.text}")
else:
data = res["data"]["nodes"]
logger.trace(f"resource_tree_get查询到物料: {data}")
return data
else:
logger.error(f"查询物料失败: {response.text}")
@@ -162,14 +170,14 @@ class HTTPClient:
if not self.initialized:
self.initialized = True
info(f"首次添加资源,当前远程地址: {self.remote_addr}")
response = requests.post(
response = self._session.post(
f"{self.remote_addr}/lab/material",
json={"nodes": resources},
headers={"Authorization": f"Lab {self.auth}"},
timeout=100,
)
else:
response = requests.put(
response = self._session.put(
f"{self.remote_addr}/lab/material",
json={"nodes": resources},
headers={"Authorization": f"Lab {self.auth}"},
@@ -196,7 +204,7 @@ class HTTPClient:
"""
with open(os.path.join(BasicConfig.working_dir, "req_resource_get.json"), "w", encoding="utf-8") as f:
f.write(json.dumps({"id": id, "with_children": with_children}, indent=4))
response = requests.get(
response = self._session.get(
f"{self.remote_addr}/lab/material",
params={"id": id, "with_children": with_children},
headers={"Authorization": f"Lab {self.auth}"},
@@ -237,14 +245,14 @@ class HTTPClient:
if not self.initialized:
self.initialized = True
info(f"首次添加资源,当前远程地址: {self.remote_addr}")
response = requests.post(
response = self._session.post(
f"{self.remote_addr}/lab/material",
json={"nodes": resources},
headers={"Authorization": f"Lab {self.auth}"},
timeout=100,
)
else:
response = requests.put(
response = self._session.put(
f"{self.remote_addr}/lab/material",
json={"nodes": resources},
headers={"Authorization": f"Lab {self.auth}"},
@@ -274,7 +282,7 @@ class HTTPClient:
with open(file_path, "rb") as file:
files = {"files": file}
logger.info(f"上传文件: {file_path}{scene}")
response = requests.post(
response = self._session.post(
f"{self.remote_addr}/api/account/file_upload/{scene}",
files=files,
headers={"Authorization": f"Lab {self.auth}"},
@@ -314,7 +322,7 @@ class HTTPClient:
"Content-Type": "application/json",
"Content-Encoding": "gzip",
}
response = requests.post(
response = self._session.post(
f"{self.remote_addr}/lab/resource",
data=compressed_body,
headers=headers,
@@ -348,7 +356,7 @@ class HTTPClient:
Returns:
Response: API响应对象
"""
response = requests.get(
response = self._session.get(
f"{self.remote_addr}/edge/material/download",
headers={"Authorization": f"Lab {self.auth}"},
timeout=(3, 30),
@@ -409,7 +417,7 @@ class HTTPClient:
with open(os.path.join(BasicConfig.working_dir, "req_workflow_upload.json"), "w", encoding="utf-8") as f:
f.write(json.dumps(payload, indent=4, ensure_ascii=False))
response = requests.post(
response = self._session.post(
f"{self.remote_addr}/lab/workflow/owner/import",
json=payload,
headers={"Authorization": f"Lab {self.auth}"},

View File

@@ -754,6 +754,32 @@ class MessageProcessor:
req = JobAddReq(**data)
job_log = format_job_log(req.job_id, req.task_id, req.device_id, req.action)
# 服务端对always_free动作可能跳过query_action_state直接发job_start
# 此时job尚未注册需要自动补注册
existing_job = self.device_manager.get_job_info(req.job_id)
if not existing_job:
action_name = req.action
device_action_key = f"/devices/{req.device_id}/{action_name}"
action_always_free = self._check_action_always_free(req.device_id, action_name)
if action_always_free:
job_info = JobInfo(
job_id=req.job_id,
task_id=req.task_id,
device_id=req.device_id,
action_name=action_name,
device_action_key=device_action_key,
status=JobStatus.QUEUE,
start_time=time.time(),
always_free=True,
)
self.device_manager.add_queue_request(job_info)
logger.info(f"[MessageProcessor] Job {job_log} always_free, auto-registered from direct job_start")
else:
logger.error(f"[MessageProcessor] Job {job_log} not registered (missing query_action_state)")
return
success = self.device_manager.start_job(req.job_id)
if not success:
logger.error(f"[MessageProcessor] Failed to start job {job_log}")
@@ -1087,7 +1113,7 @@ class MessageProcessor:
"task_id": task_id,
"job_id": job_id,
"free": free,
"need_more": need_more,
"need_more": need_more + 1,
},
}
@@ -1227,7 +1253,7 @@ class QueueProcessor:
"task_id": job_info.task_id,
"job_id": job_info.job_id,
"free": False,
"need_more": 10,
"need_more": 10 + 1,
},
}
self.message_processor.send_message(message)
@@ -1243,7 +1269,13 @@ class QueueProcessor:
if not queued_jobs:
return
logger.debug(f"[QueueProcessor] Sending busy status for {len(queued_jobs)} queued jobs")
queue_summary = {}
for j in queued_jobs:
key = f"{j.device_id}/{j.action_name}"
queue_summary[key] = queue_summary.get(key, 0) + 1
logger.debug(
f"[QueueProcessor] Sending busy status for {len(queued_jobs)} queued jobs: {queue_summary}"
)
for job_info in queued_jobs:
# 快照可能已过期:在遍历过程中 end_job() 可能已将此 job 移至 READY
@@ -1260,7 +1292,7 @@ class QueueProcessor:
"task_id": job_info.task_id,
"job_id": job_info.job_id,
"free": False,
"need_more": 10,
"need_more": 10 + 1,
},
}
success = self.message_processor.send_message(message)
@@ -1343,6 +1375,10 @@ class WebSocketClient(BaseCommunicationClient):
self.message_processor = MessageProcessor(self.websocket_url, self.send_queue, self.device_manager)
self.queue_processor = QueueProcessor(self.device_manager, self.message_processor)
# running状态debounce缓存: {job_id: (last_send_timestamp, last_feedback_data)}
self._job_running_last_sent: Dict[str, tuple] = {}
self._job_running_debounce_interval: float = 10.0 # 秒
# 设置相互引用
self.message_processor.set_queue_processor(self.queue_processor)
self.message_processor.set_websocket_client(self)
@@ -1442,22 +1478,32 @@ class WebSocketClient(BaseCommunicationClient):
logger.debug(f"[WebSocketClient] Not connected, cannot publish job status for job_id: {item.job_id}")
return
job_log = format_job_log(item.job_id, item.task_id, item.device_id, item.action_name)
# 拦截最终结果状态,与原版本逻辑一致
if status in ["success", "failed"]:
self._job_running_last_sent.pop(item.job_id, None)
host_node = HostNode.get_instance(0)
if host_node:
# 从HostNode的device_action_status中移除job_id
try:
host_node._device_action_status[item.device_action_key].job_ids.pop(item.job_id, None)
except (KeyError, AttributeError):
logger.warning(f"[WebSocketClient] Failed to remove job {item.job_id} from HostNode status")
# logger.debug(f"[WebSocketClient] Intercepting final status for job_id: {item.job_id} - {status}")
# 通知队列处理器job完成包括timeout的job
self.queue_processor.handle_job_completed(item.job_id, status)
# 发送job状态消息
# running状态按job_id做debounce内容变化时仍然上报
if status == "running":
now = time.time()
cached = self._job_running_last_sent.get(item.job_id)
if cached is not None:
last_ts, last_data = cached
if now - last_ts < self._job_running_debounce_interval and last_data == feedback_data:
logger.trace(f"[WebSocketClient] Job status debounced (skip): {job_log} - {status}")
return
self._job_running_last_sent[item.job_id] = (now, feedback_data)
message = {
"action": "job_status",
"data": {
@@ -1473,7 +1519,6 @@ class WebSocketClient(BaseCommunicationClient):
}
self.message_processor.send_message(message)
job_log = format_job_log(item.job_id, item.task_id, item.device_id, item.action_name)
logger.trace(f"[WebSocketClient] Job status published: {job_log} - {status}")
def send_ping(self, ping_id: str, timestamp: float) -> None: