{
  "episode_id": "rider-delivery-03_w3SZVPiNxDU_390_450",
  "task_id": 9003,
  "task_name": "outdoor_food_delivery_dropoff_to_doorman",
  "task_text_en": "Full dropoff sequence: rider approaches a building, stops at the curb, dismounts, walks to the entrance, hands the food bag to the doorman, remounts and rides away.",
  "task_text_zh": "完整送达流程：骑手接近大楼、停靠路缘、下车、走向入口、把餐袋交给门童、重新上车并骑离。",
  "init_text_en": "Rider is on a tree-lined park-edge street approaching the destination building, which has construction scaffolding over its sidewalk entrance.",
  "init_text_zh": "骑手正沿一条树木夹道的公园路边接近目的地大楼，该楼人行道入口处搭有施工脚手架。",
  "robot": "Human Delivery Rider (Ego-POV Camera)",
  "robot_kinematics": "human + e-scooter · chest/helmet-mounted camera",
  "robot_sn": "rider-cam-001",
  "scene": "Outdoor",
  "sub_scene": "urban_delivery_dropoff_doorman_handoff",
  "data_gen_mode": "real_machine",
  "fps": 30,
  "duration_seconds": 60,
  "frame_count_video": 1800,
  "frame_count_annotated": 36,
  "annotation_sampling": "0.5fps base + 6 event keyframes around stop/dismount/handoff/remount",
  "videos": {
    "ego": {
      "src": "ego.mp4",
      "label": "EGO POV · CHEST CAM",
      "resolution": "568×320",
      "role": "primary"
    }
  },
  "annotation_layers": [
    "L1: 2D detection + tracking · 21 track IDs, 103 bounding boxes total (doorman, food bag, revolving door, scaffolding, traffic infra)",
    "L3: ego action + state — full transition: riding → stopped → dismounting → walking → standing → remounting → riding",
    "L5: bilingual captions (7 clip-level · 13 event-level · 36 keyframe-level), covering the full dropoff narrative"
  ],
  "available_streams": [
    "RGB × 1 (ego POV, 60 s · 30 fps)",
    "L1 detection: 21 track IDs across 36 frames",
    "L3 ego state: covers all 7 steps of the mode sequence (riding → mounted → dismounting → walking → standing_at_door → remounting → riding)",
    "L4 narrative events: 13 — full dropoff sequence (approach → arrive → dismount → walk → meet doorman → handoff → remount → resume)",
    "L5 bilingual captions: 56 total entries (clip + event + keyframe)",
    "This is the most ego-state-rich sample of the three (clip 01 has 5 modes, clip 02 stays in 1 mode, clip 03 covers all 7)"
  ],
  "spans_source": "L5 keyframe + event captions, grouped by clip"
}
