From afa10cc1cbaac1798e9984128c2e83b5c90ab73b Mon Sep 17 00:00:00 2001
From: Xiaoyi <xiaoyizhang@microsoft.com>
Date: Tue, 15 Jul 2025 23:54:39 -0700
Subject: [PATCH 01/15] support openai api

---
 README.md             |   2 +-
 dvd/build_database.py |   6 +++
 dvd/config.py         |   4 ++
 dvd/dvd_core.py       |   1 +
 dvd/frame_caption.py  |   2 +
 dvd/utils.py          | 121 ++++++++++++++++++++++++++++--------------
 6 files changed, 94 insertions(+), 42 deletions(-)

diff --git a/README.md b/README.md
index bdb2e7d..3637ccd 100644
--- a/README.md
+++ b/README.md
@@ -59,7 +59,7 @@ The `local_run.py` script provides an example of how to run the Deep Video Disco
 
 ## TODO
 
-- [ ] Support OpenAI API key configuration.
+- [x] Support OpenAI API key configuration.
 - [ ] Implement MCP server.
 - [ ] Release evaluation trajectory data on long video benchmarks.
 
diff --git a/dvd/build_database.py b/dvd/build_database.py
index 5361f11..51a3db5 100644
--- a/dvd/build_database.py
+++ b/dvd/build_database.py
@@ -102,6 +102,7 @@ def frame_inspect_tool(
         messages=input_msgs,
         endpoints=config.AOAI_TOOL_VLM_ENDPOINT_LIST,
         model_name=config.AOAI_TOOL_VLM_MODEL_NAME,
+        api_key=config.OPENAI_API_KEY,
         image_paths=files,
         temperature=0,
         max_tokens=512,
@@ -130,6 +131,7 @@ def clip_search_tool(
         endpoints=config.AOAI_EMBEDDING_RESOURCE_LIST,
         model_name=config.AOAI_EMBEDDING_LARGE_MODEL_NAME,
         input_text=[event_description],
+        api_key=config.OPENAI_API_KEY,
     )[0]['embedding']
     results = database.query(
         query_emb,
@@ -164,6 +166,7 @@ def global_browse_tool(
         endpoints=config.AOAI_EMBEDDING_RESOURCE_LIST,
         model_name=config.AOAI_EMBEDDING_LARGE_MODEL_NAME,
         input_text=[query],
+        api_key=config.OPENAI_API_KEY,
     )[0]['embedding']
     results = database.query(
         query_emb,
@@ -202,6 +205,7 @@ def global_browse_tool(
         messages=input_msgs,
         endpoints=config.AOAI_TOOL_VLM_ENDPOINT_LIST,
         model_name=config.AOAI_TOOL_VLM_MODEL_NAME,
+        api_key=config.OPENAI_API_KEY,
         temperature=0,
         max_tokens=512,
     )
@@ -318,6 +322,7 @@ def single_batch_embedding_task(data):
         endpoints=config.AOAI_EMBEDDING_RESOURCE_LIST,
         model_name=config.AOAI_EMBEDDING_LARGE_MODEL_NAME,
         input_text=captions,
+        api_key=config.OPENAI_API_KEY,
     )
     max_tries = 3
     while embs is None or len(embs) != len(captions):
@@ -329,6 +334,7 @@ def single_batch_embedding_task(data):
             endpoints=config.AOAI_EMBEDDING_RESOURCE_LIST,
             model_name=config.AOAI_EMBEDDING_LARGE_MODEL_NAME,
             input_text=captions,
+            api_key=config.OPENAI_API_KEY,
         )
     return list(zip(timestamps, cap_infos, [d['embedding'] for d in embs]))
 
diff --git a/dvd/config.py b/dvd/config.py
index e5e00fa..7bde7a5 100644
--- a/dvd/config.py
+++ b/dvd/config.py
@@ -1,3 +1,5 @@
+import os
+
 # ------------------ video download and segmentation configuration ------------------ #
 VIDEO_DATABASE_FOLDER = "./video_database/"
 VIDEO_RESOLUTION = "360" # denotes the height of the video 
@@ -5,6 +7,8 @@
 CLIP_SECS = 10 # seconds
 
 # ------------------ model configuration ------------------ #
+OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", None) # will overwrite Azure OpenAI setting
+
 AOAI_CAPTION_VLM_ENDPOINT_LIST = [""]
 AOAI_CAPTION_VLM_MODEL_NAME = "gpt-4.1-mini"
 
diff --git a/dvd/dvd_core.py b/dvd/dvd_core.py
index 656d6b8..142af9c 100644
--- a/dvd/dvd_core.py
+++ b/dvd/dvd_core.py
@@ -139,6 +139,7 @@ def run(self, question) -> list[dict]:
                 model_name=config.AOAI_ORCHESTRATOR_LLM_MODEL_NAME,
                 tools=self.function_schemas,
                 temperature=0.0,
+                api_key=config.OPENAI_API_KEY,
             )
             if response is None:
                 return None
diff --git a/dvd/frame_caption.py b/dvd/frame_caption.py
index eea8684..d0a36b8 100644
--- a/dvd/frame_caption.py
+++ b/dvd/frame_caption.py
@@ -233,6 +233,7 @@ def _caption_clip(task: Tuple[str, Dict], caption_ckpt_folder) -> Tuple[str, dic
             model_name=config.AOAI_CAPTION_VLM_MODEL_NAME,
             return_json=True,
             image_paths=files,
+            api_key=config.OPENAI_API_KEY,
         )["content"]
         if resp is None:
             continue
@@ -269,6 +270,7 @@ def merge_subject_registries(registries: List[dict]) -> dict:
             endpoints=config.AOAI_CAPTION_VLM_ENDPOINT_LIST,
             model_name=config.AOAI_CAPTION_VLM_MODEL_NAME,
             return_json=True,
+            api_key=config.OPENAI_API_KEY,
         )["content"]
         if resp is None:
             continue
diff --git a/dvd/utils.py b/dvd/utils.py
index 0233a99..077cb28 100644
--- a/dvd/utils.py
+++ b/dvd/utils.py
@@ -76,6 +76,7 @@ def call_openai_model_with_tools(
     messages,
     endpoints,
     model_name,
+    api_key: str = None,
     tools: list = [],  # List of tool definitions
     image_paths: list = [],  
     max_tokens: int = 4096,  
@@ -83,22 +84,32 @@ def call_openai_model_with_tools(
     tool_choice: str = "auto",  # Can be "auto", "none", or a specific tool
     return_json: bool = False,
 ) -> dict:  
-    credential = AzureCliCredential()  
-    token = credential.get_token('https://cognitiveservices.azure.com/')  
-    headers = {  
-        "Content-Type": "application/json",  
-        'Authorization': 'Bearer ' + token.token  
-    }  
-    if isinstance(endpoints, str):
-        endpoint = endpoints
-    elif isinstance(endpoints, list):
-        endpoint = random.choice(endpoints)
+    if api_key:
+        headers = {  
+            "Content-Type": "application/json",  
+            'Authorization': 'Bearer ' + api_key
+        }
+        endpoint = "https://api.openai.com/v1"
+        url = f"{endpoint}/chat/completions"
     else:
-        raise ValueError("Endpoints must be a string or a list of strings.")  
+        credential = AzureCliCredential()  
+        token = credential.get_token('https://cognitiveservices.azure.com/')  
+        headers = {  
+            "Content-Type": "application/json",  
+            'Authorization': 'Bearer ' + token.token  
+        }  
+        if isinstance(endpoints, str):
+            endpoint = endpoints
+        elif isinstance(endpoints, list):
+            endpoint = random.choice(endpoints)
+        else:
+            raise ValueError("Endpoints must be a string or a list of strings.")
+        url = f"{endpoint}/openai/deployments/{model_name}/chat/completions?api-version=2025-03-01-preview"
+
     model = model_name
-    url = f"{endpoint}/openai/deployments/{model}/chat/completions?api-version=2025-03-01-preview"  
       
     payload = {  
+        "model": model,
         "messages": copy.deepcopy(messages),  
         # "reasoning_effort": reasoning_effort,
     }  
@@ -140,7 +151,7 @@ def call_openai_model_with_tools(
 class AzureOpenAIEmbeddingService:  
     @staticmethod  
     @retry_with_exponential_backoff
-    def get_embeddings(endpoints, model_name, input_text):  
+    def get_embeddings(endpoints, model_name, input_text, api_key: str = None):  
         """  
         Call Azure OpenAI Embedding service and get embeddings for the input text.  
   
@@ -150,27 +161,35 @@ def get_embeddings(endpoints, model_name, input_text):
         :param input_text: The text for which you want to generate embeddings.  
         :return: The embeddings as a JSON response.  
         """  
-        if isinstance(endpoints, str):
-            endpoint = endpoints
-        elif isinstance(endpoints, list):
-            endpoint = random.choice(endpoints)
+        if api_key:
+            headers = {  
+                "Content-Type": "application/json",  
+                'Authorization': 'Bearer ' + api_key
+            }
+            endpoint = "https://api.openai.com/v1"
+            url = f"{endpoint}/embeddings"
         else:
-            raise ValueError("Endpoints must be a string or a list of strings.")  
+            if isinstance(endpoints, str):
+                endpoint = endpoints
+            elif isinstance(endpoints, list):
+                endpoint = random.choice(endpoints)
+            else:
+                raise ValueError("Endpoints must be a string or a list of strings.")  
+            # Define the URL for the embeddings endpoint  
+            url = f"{endpoint}/openai/deployments/{model_name}/embeddings?api-version=2023-05-15"  
+    
+            credential = AzureCliCredential()  
+            token = credential.get_token('https://cognitiveservices.azure.com/')  
+            headers = {  
+                "Content-Type": "application/json",  
+                'Authorization': 'Bearer ' + token.token  
+            }
+        
         model = model_name
-        if isinstance(endpoint, list):
-            endpoint = random.choice(endpoint)
-        # Define the URL for the embeddings endpoint  
-        url = f"{endpoint}/openai/deployments/{model}/embeddings?api-version=2023-05-15"  
-  
-        credential = AzureCliCredential()  
-        token = credential.get_token('https://cognitiveservices.azure.com/')  
-        headers = {  
-            "Content-Type": "application/json",  
-            'Authorization': 'Bearer ' + token.token  
-        }    
         # Set up the payload for the request  
         payload = {  
-            "input": input_text  
+            "input": input_text,
+            "model": model
         }  
   
         # Make the request to the Azure OpenAI service  
@@ -217,14 +236,34 @@ def extract_answer(message: dict) -> str | None:
 
 
 if __name__ == "__main__":
-    call_openai_model_with_tools(
-        messages=[{"role": "user", "content": "Hello, how are you?"}],
-        endpoints=["https://msra-im-openai-eus2.openai.azure.com"],
-        model_name="o3",
-        tools=[],
-        image_paths=[],
-        max_tokens=4096,
-        temperature=0.0,
-        tool_choice="auto",
-        return_json=False,
-    )
\ No newline at end of file
+    # Example for Azure
+    # call_openai_model_with_tools(
+    #     messages=[{"role": "user", "content": "Hello, how are you?"}],
+    #     endpoints=["https://msra-im-openai-eus2.openai.azure.com"],
+    #     model_name="o3",
+    #     tools=[],
+    #     image_paths=[],
+    #     max_tokens=4096,
+    #     temperature=0.0,
+    #     tool_choice="auto",
+    #     return_json=False,
+    # )
+
+    # Example for OpenAI
+    api_key = os.environ.get("OPENAI_API_KEY")
+    if api_key:
+        response = call_openai_model_with_tools(
+            messages=[{"role": "user", "content": "Hello, how are you?"}],
+            endpoints=None, # Not used for OpenAI
+            model_name="gpt-4o",
+            api_key=api_key,
+            tools=[],
+            image_paths=[],
+            max_tokens=4096,
+            temperature=0.0,
+            tool_choice="auto",
+            return_json=False,
+        )
+        print(response)
+    else:
+        print("OPENAI_API_KEY environment variable not set.")
\ No newline at end of file

From 39bc683a2450964524cbfa752f850277cb580dc2 Mon Sep 17 00:00:00 2001
From: Xiaoyi <xiaoyizhang@microsoft.com>
Date: Wed, 16 Jul 2025 01:23:42 -0700
Subject: [PATCH 02/15] add lite mode to support audio-focused youtube video

---
 README.md            |  6 ++++
 dvd/config.py        |  1 +
 dvd/dvd_core.py      |  2 ++
 dvd/frame_caption.py | 21 ++++++++++++++
 dvd/video_utils.py   | 47 ++++++++++++++++++++++++++++++--
 local_run.py         | 65 ++++++++++++++++++++++++++++----------------
 6 files changed, 116 insertions(+), 26 deletions(-)

diff --git a/README.md b/README.md
index 3637ccd..7fcd417 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,12 @@ This repository contains the official implementation of the paper [Deep Video Di
 ![image](https://github.com/user-attachments/assets/ac1c7f0a-3c10-4c4c-88d1-7bfe0e2010e1)
 
 
+## Update
+
+- **2025/07/16**: Add `lite_mode` to enable a lightweight version of the agent that uses only subtitles. Good for Youtube podcast analysis!
+- **2025/07/14**: Support OpenAI API and Azure OpenAI API.
+- **2025/07/08**: Initial release of the Deep Video Discovery codebase.
+
 ## Introduction
 
 **Deep Video Discovery (DVD)** is a deep-research style question answering agent designed for understanding extra-long videos. Leveraging the powerful capabilities of large language models (LLMs), DVD effectively interprets and processes extensive video content to answer complex user queries.
diff --git a/dvd/config.py b/dvd/config.py
index 7bde7a5..0659d71 100644
--- a/dvd/config.py
+++ b/dvd/config.py
@@ -24,6 +24,7 @@
 AOAI_EMBEDDING_LARGE_DIM = 3072
 
 # ------------------ agent and tool setting ------------------ #
+LITE_MODE = True # if True, only leverage srt subtitle, no pixel downloaded or pixel captioning
 GLOBAL_BROWSE_TOPK = 300
 OVERWRITE_CLIP_SEARCH_TOPK = 0 # 0 means no overwrite and let agent decide
 
diff --git a/dvd/dvd_core.py b/dvd/dvd_core.py
index 142af9c..c3418f9 100644
--- a/dvd/dvd_core.py
+++ b/dvd/dvd_core.py
@@ -26,6 +26,8 @@ def finish(answer: A[str, D("Answer to the user's question.")]) -> None:
 class DVDCoreAgent:
     def __init__(self, video_db_path, video_caption_path, max_iterations):
         self.tools = [frame_inspect_tool, clip_search_tool, global_browse_tool, finish]
+        if config.LITE_MODE:
+            self.tools.remove(frame_inspect_tool)
         self.name_to_function_map = {tool.__name__: tool for tool in self.tools}
         self.function_schemas = [
             {"function": as_json_schema(func), "type": "function"}
diff --git a/dvd/frame_caption.py b/dvd/frame_caption.py
index d0a36b8..9ed7e0a 100644
--- a/dvd/frame_caption.py
+++ b/dvd/frame_caption.py
@@ -240,6 +240,8 @@ def _caption_clip(task: Tuple[str, Dict], caption_ckpt_folder) -> Tuple[str, dic
         try:
             assert isinstance(resp, str), f"Response must be a JSON string instead of {type(resp)}:{resp}."
             parsed = json.loads(resp)
+            parsed["clip_description"] += f"\n\nTranscript during this video clip: {transcript}." # add transcript to description
+            resp = json.dumps(parsed)
             with open(os.path.join(caption_ckpt_folder, f"{timestamp}.json"), "w") as f:
                 f.write(resp)
             return timestamp, parsed
@@ -329,6 +331,25 @@ def process_video(
         json.dump(frame_captions, f, indent=4)
 
 
+def process_video_lite(output_caption_folder: str, subtitle_file_path: str):
+    """
+    Build `captions.json` for LITE_MODE from an SRT subtitle file only.
+
+    :param output_caption_folder: Folder that receives `captions.json`;
+        it is created when missing.
+    :param subtitle_file_path: SRT file handed to `parse_srt_to_dict`.
+    """
+    captions = parse_srt_to_dict(subtitle_file_path)
+    frame_captions = {
+        key: {"caption": f"\n\nTranscript during this video clip: {text}."}
+        for key, text in captions.items()
+    }
+    frame_captions["subject_registry"] = {}  # key is kept, with an empty value
+    # Create the folder up front: open() does not create missing directories.
+    os.makedirs(output_caption_folder or ".", exist_ok=True)
+    with open(os.path.join(output_caption_folder, "captions.json"), "w") as f:
+        json.dump(frame_captions, f, indent=4)
+
 # --------------------------------------------------------------------------- #
 #                                    main                                     #
 # --------------------------------------------------------------------------- #
diff --git a/dvd/video_utils.py b/dvd/video_utils.py
index 2b3840f..6660dfb 100644
--- a/dvd/video_utils.py
+++ b/dvd/video_utils.py
@@ -99,10 +99,51 @@ def load_video(
             )
             shutil.copy2(subtitle_source, subtitle_destination)
 
-        return os.path.abspath(destination_path)
+def download_srt_subtitle(video_url: str, output_path: str):
+    """Download English SRT subtitles for a YouTube video to `output_path`.
+
+    Uploaded subtitles are tried first, then auto-generated ones. The first
+    `<id>*.srt` file found in the target folder is moved to `output_path`.
+
+    :param video_url: YouTube URL of the video.
+    :param output_path: Destination path of the `.srt` file.
+    :raises ValueError: if `video_url` is not a YouTube link.
+    :raises FileNotFoundError: if neither attempt produced an `.srt` file.
+    """
+    if not _is_youtube_url(video_url):
+        raise ValueError("Provided URL is not a valid YouTube link.")
+
+    # A bare file name has no directory part; fall back to the current folder.
+    output_dir = os.path.dirname(output_path) or "."
+    os.makedirs(output_dir, exist_ok=True)
+
+    # NOTE(review): no subtitle-conversion postprocessor is configured, so this
+    # relies on yt-dlp writing `.srt` directly -- confirm against yt-dlp docs.
+    ydl_opts = {
+        'writesubtitles': True,
+        'subtitleslangs': ['en'],
+        'subtitlesformat': 'srt',
+        'skip_download': True,
+        'outtmpl': os.path.join(output_dir, '%(id)s.%(ext)s'),
+    }
+
+    # Resolved on the first attempt and reused for the second.
+    video_id = None
+    for use_auto_subs in (False, True):  # uploaded subtitles first
+        attempt_opts = dict(ydl_opts, writeautomaticsub=use_auto_subs)
+        with yt_dlp.YoutubeDL(attempt_opts) as ydl:
+            if video_id is None:
+                video_id = ydl.extract_info(video_url, download=False)['id']
+            ydl.download([video_url])
+
+        # yt-dlp names subtitle files as <id>.<lang>.srt
+        for f in os.listdir(output_dir):
+            if f.startswith(video_id) and f.endswith(".srt"):
+                shutil.move(os.path.join(output_dir, f), output_path)
+                return
+
+    raise FileNotFoundError(f"Could not find SRT subtitle for {video_url}")
 
-    # ------------------- Not found -------------------
-    raise FileNotFoundError(f"Video source '{video_source}' not found or is not a valid URL.")
 
 def decode_video_to_frames(video_path: str) -> str:
     """
diff --git a/local_run.py b/local_run.py
index c08f239..c74b0a1 100644
--- a/local_run.py
+++ b/local_run.py
@@ -2,8 +2,8 @@
 import os
 import argparse
 from dvd.dvd_core import DVDCoreAgent
-from dvd.video_utils import load_video, decode_video_to_frames
-from dvd.frame_caption import process_video
+from dvd.video_utils import load_video, decode_video_to_frames, download_srt_subtitle
+from dvd.frame_caption import process_video, process_video_lite
 from dvd.utils import extract_answer
 
 def main():
@@ -27,31 +27,50 @@ def main():
     frames_dir = os.path.join(config.VIDEO_DATABASE_FOLDER, video_id, "frames")
     captions_dir = os.path.join(config.VIDEO_DATABASE_FOLDER, video_id, "captions")
     video_db_path = os.path.join(config.VIDEO_DATABASE_FOLDER, video_id, "database.json")
+    srt_path = os.path.join(config.VIDEO_DATABASE_FOLDER, video_id, "subtitles.srt")
 
-    # Download video
-    if not os.path.exists(video_path):
-        print(f"Downloading video from {video_url} to {video_path}...")
-        load_video(video_url, video_path)
-        print("Video downloaded.")
+    if config.LITE_MODE:
+        print("Running in LITE_MODE.")
+        if not os.path.exists(srt_path):
+            print(f"Downloading SRT subtitle for {video_url} to {srt_path}...")
+            try:
+                download_srt_subtitle(video_url, srt_path)
+                print("SRT subtitle downloaded.")
+            except Exception as e:
+                print(f"Error downloading subtitle: {e}")
+                print("Please turn off LITE_MODE and try again.")
+                return
+        else:
+            print(f"SRT subtitle already exists at {srt_path}.")
+        
+        # In LITE_MODE, we use srt as caption file
+        process_video_lite(captions_dir, srt_path)
+        caption_file = os.path.join(captions_dir, "captions.json")
     else:
-        print(f"Video already exists at {video_path}.")
+        # Download video
+        if not os.path.exists(video_path):
+            print(f"Downloading video from {video_url} to {video_path}...")
+            load_video(video_url, video_path)
+            print("Video downloaded.")
+        else:
+            print(f"Video already exists at {video_path}.")
 
-    # Decode video to frames
-    if not os.path.exists(frames_dir) or not os.listdir(frames_dir):
-        print(f"Decoding video to frames in {frames_dir}...")
-        decode_video_to_frames(video_path)
-        print("Video decoded.")
-    else:
-        print(f"Frames already exist in {frames_dir}.")
+        # Decode video to frames
+        if not os.path.exists(frames_dir) or not os.listdir(frames_dir):
+            print(f"Decoding video to frames in {frames_dir}...")
+            decode_video_to_frames(video_path)
+            print("Video decoded.")
+        else:
+            print(f"Frames already exist in {frames_dir}.")
 
-    # Get captions
-    caption_file = os.path.join(captions_dir, "captions.json")
-    if not os.path.exists(caption_file):
-        print("Processing video to get captions...")
-        process_video(frames_dir, captions_dir)
-        print("Captions generated.")
-    else:
-        print(f"Captions already exist at {caption_file}.")
+        # Get captions
+        caption_file = os.path.join(captions_dir, "captions.json")
+        if not os.path.exists(caption_file):
+            print("Processing video to get captions...")
+            process_video(frames_dir, captions_dir)
+            print("Captions generated.")
+        else:
+            print(f"Captions already exist at {caption_file}.")
 
     # Initialize DVDCoreAgent
     print("Initializing DVDCoreAgent...")

From 608f010c901e404e9248f47a87c922b807091811 Mon Sep 17 00:00:00 2001
From: Xiaoyi <xiaoyizhang@microsoft.com>
Date: Thu, 17 Jul 2025 02:36:11 -0700
Subject: [PATCH 03/15] add gradio demo

---
 .gitignore      |   1 +
 README.md       |   2 +
 app.py          | 276 ++++++++++++++++++++++++++++++++++++++++++++++++
 dvd/dvd_core.py |  56 ++++++++++
 4 files changed, 335 insertions(+)
 create mode 100644 app.py

diff --git a/.gitignore b/.gitignore
index e08cddb..4f09084 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 tmp/
 video_database/
 .git.bak/
+.gradio/
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/README.md b/README.md
index 7fcd417..27f0344 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,7 @@
 
 [![arXiv](https://img.shields.io/badge/arXiv-2504.16082-A42C25?style=flat&logo=arXiv&logoColor=A42C25)](https://arxiv.org/abs/2505.18079)
 [![License](https://img.shields.io/badge/License-MIT-red.svg)](https://opensource.org/licenses/MIT)
+[![Hugging Face Spaces](https://img.shields.io/badge/Spaces-Demo-blueviolet?logo=huggingface&logoColor=white)](TODO)
 
 
 This repository contains the official implementation of the paper [Deep Video Discovery: Agentic Search with Tool Use for Long-form Video Understanding](https://arxiv.org/abs/2505.18079), which achieves the state-of-the-art performance by a large margin on multiple long video benchmarks including the challenging [LVBench](https://lvbench.github.io/).
@@ -11,6 +12,7 @@ This repository contains the official implementation of the paper [Deep Video Di
 
 ## Update
 
+- **2025/07/17**: Add gradio demo.
 - **2025/07/16**: Add `lite_mode` to enable a lightweight version of the agent that uses only subtitles. Good for Youtube podcast analysis!
 - **2025/07/14**: Support OpenAI API and Azure OpenAI API.
 - **2025/07/08**: Initial release of the Deep Video Discovery codebase.
diff --git a/app.py b/app.py
new file mode 100644
index 0000000..1a69102
--- /dev/null
+++ b/app.py
@@ -0,0 +1,276 @@
+import json
+import os, argparse, gradio as gr
+from dvd import config
+from dvd.dvd_core import DVDCoreAgent
+from dvd.video_utils import load_video, decode_video_to_frames, download_srt_subtitle
+from dvd.frame_caption import process_video, process_video_lite
+from dvd.utils import extract_answer
+
+########################################################################
+# Helper functions
+########################################################################
+def get_youtube_thumbnail(video_url: str):
+    """Return the `hqdefault` thumbnail URL for a YouTube link, else None.
+
+    Handles `youtube.com/watch?v=<id>` and `youtu.be/<id>` style URLs.
+    """
+    if not video_url:
+        return None
+
+    # Pull the video ID out of whichever URL form was given.
+    if "youtube.com/watch?v=" in video_url:
+        yt_id = video_url.split("v=")[1].split("&")[0]
+    elif "youtu.be/" in video_url:
+        yt_id = video_url.split("youtu.be/")[1].split("?")[0]
+    else:
+        yt_id = None
+
+    # Available qualities: maxresdefault > hqdefault > mqdefault > default
+    return f"https://img.youtube.com/vi/{yt_id}/hqdefault.jpg" if yt_id else None
+
+def _prepare_video_assets(video_url: str):
+    """Prepare captions for `video_url`, reusing files already on disk.
+    LITE_MODE only fetches subtitles; otherwise the video is downloaded, decoded
+    to frames and captioned. Returns (video_id, caption_file, video_db_path)."""
+    if "v=" in video_url:                         # YouTube URL
+        # Drop trailing query parameters (e.g. "&t=42s") from the ID.
+        video_id = video_url.split("v=")[1].split("&")[0]
+    else:                                         # local file or misc.
+        video_id = os.path.splitext(os.path.basename(video_url))[0]
+
+    video_path   = os.path.join(config.VIDEO_DATABASE_FOLDER, "raw", f"{video_id}.mp4")
+    frames_dir   = os.path.join(config.VIDEO_DATABASE_FOLDER, video_id, "frames")
+    captions_dir = os.path.join(config.VIDEO_DATABASE_FOLDER, video_id, "captions")
+    video_db_path= os.path.join(config.VIDEO_DATABASE_FOLDER, video_id, "database.json")
+    srt_path     = os.path.join(config.VIDEO_DATABASE_FOLDER, video_id, "subtitles.srt")
+    caption_file = os.path.join(captions_dir, "captions.json")
+
+    if config.LITE_MODE:
+        if not os.path.exists(srt_path):
+            download_srt_subtitle(video_url, srt_path)
+        process_video_lite(captions_dir, srt_path)
+    else:
+        if not os.path.exists(video_path):
+            load_video(video_url, video_path)
+        if not os.path.exists(frames_dir) or not os.listdir(frames_dir):
+            decode_video_to_frames(video_path)
+        if not os.path.exists(caption_file):
+            process_video(frames_dir, captions_dir)
+
+    return video_id, caption_file, video_db_path
+
+def solve(video_url: str, question: str):
+    """Streamed inference function used by Gradio.
+
+    Yields markdown snapshots of the running analysis and finally the answer;
+    exceptions are reported as a markdown error message instead of raised.
+    """
+    if not video_url or not question:
+        yield "❗ Please provide both a video URL and a question."
+        return
+
+    try:
+        yield "🔄 **Processing video...**"
+        _, caption_file, video_db_path = _prepare_video_assets(video_url)
+
+        yield "🤖 **Initializing DVD agent...**"
+        agent = DVDCoreAgent(video_db_path, caption_file, config.MAX_ITERATIONS)
+
+        accumulated_text = "### 🎯 Analysis Process:\n"
+        final_answer = None
+        for msg in agent.stream_run(question):
+            # Only process messages with a role attribute
+            if not isinstance(msg, dict) or "role" not in msg:
+                continue
+
+            # Show assistant's thinking process
+            if msg.get("role") == "assistant":
+                content = msg.get("content", "")
+                if content:
+                    accumulated_text += f"\n\n**🤔 Assistant Thinking:**\n{content}"
+                    yield accumulated_text
+
+                # Check if assistant called the finish function
+                for tc in msg.get("tool_calls") or []:  # the key may hold None
+                    if tc.get("function", {}).get("name") == "finish":
+                        try:
+                            args = json.loads(tc.get("function", {}).get("arguments", "{}"))
+                            final_answer = args.get("answer", "")
+                        except (ValueError, TypeError, AttributeError):
+                            pass  # malformed arguments: keep the previous answer
+
+            # Show when a tool is being called
+            elif msg.get("role") == "tool_call":
+                tool_name = msg.get("name", "unknown")
+                tool_args = msg.get("arguments", "{}")
+                try:
+                    args_dict = json.loads(tool_args)
+                    args_dict.pop("database", None)
+                    # Format arguments nicely
+                    args_str = json.dumps(args_dict, indent=2)
+                except (ValueError, TypeError, AttributeError):
+                    args_str = tool_args  # not a JSON object: show it verbatim
+                if tool_name != "finish":
+                    accumulated_text += f"\n\n**🔄 Calling Tool:** `{tool_name}`\n```json\n{args_str}\n```"
+                yield accumulated_text
+
+            # Show tool observations
+            elif msg.get("role") == "tool":
+                tool_name = msg.get("name", "unknown")
+                tool_result = str(msg.get("content") or "")  # tolerate None content
+
+                # Truncate long results for display
+                if len(tool_result) > 2000:
+                    tool_result = tool_result[:2000] + "..."
+
+                accumulated_text += f"\n\n**✅ Tool Result `{tool_name}`:**\n```\n{tool_result}\n```"
+                yield accumulated_text
+
+        # Add final answer if found; start on a fresh line so the heading renders
+        if final_answer:
+            accumulated_text += f"\n\n---\n### ✅📃 **Final Answer:**\n\n{final_answer}"
+        else:
+            accumulated_text += "\n\n---\n### ✅ **Analysis Complete!**"
+        yield accumulated_text
+    except Exception as e:
+        import traceback
+        yield f"### ⚠️ Error Occurred\n\n```\n{e}\n```\n\nDetails:\n```\n{traceback.format_exc()}\n```"
+
+########################################################################
+# Gradio UI
+########################################################################
+def launch(args):
+    """Build the Gradio Blocks UI (styled with custom CSS) and launch it with `share=args.share`."""
+    custom_css = """
+    .gradio-container {
+        font-family: 'Inter', sans-serif;
+    }
+    .markdown-text {
+        font-size: 16px;
+    }
+    #answer-box {
+        border: 2px solid #e5e7eb;
+        border-radius: 8px;
+        padding: 20px;
+        background-color: #f9fafb;
+        min-height: 400px;
+        max-height: 600px;
+        overflow-y: auto;
+    }
+    .button-primary {
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+        color: white;
+        font-weight: bold;
+        font-size: 18px;
+        padding: 12px 24px;
+    }
+    #video-thumbnail {
+        border-radius: 8px;
+        box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1);
+    }
+    """
+    
+    with gr.Blocks(title="DVD Video Q&A Demo", css=custom_css, theme=gr.themes.Soft()) as demo:
+        gr.Markdown(
+            """
+            # 🎬 Deep Video Discovery: Agentic Search with Tool Use for Long-form Video Understanding
+            
+            <p style="font-size: 18px; color: #6b7280;">
+            Provide a YouTube URL, then ask any question about the video content. 
+            The system will analyze the video and provide detailed answers. 
+            Note that this online demo only provides lite mode of DVD where only subtitles are used. 
+            To use full DVD capabilities, please deploy it locally.
+            </p>
+            """
+        )
+        
+        with gr.Row():
+            with gr.Column(scale=1):
+                gr.Markdown("### 📹 Video Input")
+                video_url = gr.Textbox(
+                    label="Video URL / Path",
+                    placeholder="e.g. https://www.youtube.com/watch?v=dQw4w9WgXcQ",
+                    lines=1,
+                    info="Support YouTube URLs or local video paths"
+                )
+                
+                # Thumbnail preview; starts hidden and is toggled by `update_thumbnail`
+                video_thumbnail = gr.Image(
+                    label="Video Thumbnail",
+                    elem_id="video-thumbnail",
+                    height=200,
+                    visible=False,
+                    interactive=False
+                )
+                
+                gr.Markdown("### ❓ Your Question")
+                question = gr.Textbox(
+                    label="Question about the video",
+                    placeholder="What happens in this video? Who are the main characters?",
+                    lines=3,
+                    info="Ask anything about the video content"
+                )
+                
+                with gr.Row():
+                    run_btn = gr.Button("🔍 Analyze Video", variant="primary", elem_classes=["button-primary"])
+                    clear_btn = gr.ClearButton([video_url, question, video_thumbnail], value="🗑️ Clear")
+                
+                gr.Markdown("### 💡 Example Questions")
+                examples = gr.Examples(
+                    examples=[
+                        ["https://www.youtube.com/watch?v=i2qSxMVeVLI", "What is the main topic discussed in this video?"],
+                        ["https://www.youtube.com/watch?v=nOxKexn3iBo", "Who are the speakers and what are their key points?"],
+                    ],
+                    inputs=[video_url, question],
+                    label=""
+                )
+                
+            with gr.Column(scale=2):
+                gr.Markdown("### 📊 Analysis Results")
+                answer_box = gr.Markdown(
+                    value="*Results will appear here after clicking 'Analyze Video'...*",
+                    elem_id="answer-box",
+                    label=""
+                )
+        
+        gr.Markdown(
+            """
+            ---
+            <p style="text-align: center; color: #9ca3af; font-size: 14px;">
+            DVD: Powered by advanced video understanding and language models | 
+            <a href="https://github.com/your-repo" style="color: #6366f1;">GitHub</a>
+            </p>
+            """
+        )
+        
+        # Event handlers
+        def update_thumbnail(url):
+            """Show the thumbnail for a recognised YouTube URL; hide the image otherwise."""
+            thumbnail_url = get_youtube_thumbnail(url)
+            if thumbnail_url:
+                return gr.update(value=thumbnail_url, visible=True)
+            else:
+                return gr.update(value=None, visible=False)
+        
+        video_url.change(
+            fn=update_thumbnail,
+            inputs=[video_url],
+            outputs=[video_thumbnail]
+        )
+        
+        import inspect  # used below to probe the signature of gr.Button.click
+        click_kwargs = dict(fn=solve, inputs=[video_url, question], outputs=answer_box)
+        if "stream" in inspect.signature(gr.Button.click).parameters:  # pass `stream` only if accepted
+            click_kwargs["stream"] = True
+        run_btn.click(**click_kwargs)
+
+    demo.launch(share=args.share)
+
+########################################################################
+# CLI entry-point (optional)
+########################################################################
+if __name__ == "__main__":  # script entry point: parse CLI flags, then start the UI
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--share", action="store_true", help="Gradio share flag")  # read as args.share
+    args = parser.parse_args()
+    launch(args)
\ No newline at end of file
diff --git a/dvd/dvd_core.py b/dvd/dvd_core.py
index c3418f9..630d181 100644
--- a/dvd/dvd_core.py
+++ b/dvd/dvd_core.py
@@ -158,6 +158,62 @@ def run(self, question) -> list[dict]:
 
         return msgs
 
+    # ------------------------------------------------------------------ #
+    # Streaming (generator) loop
+    # ------------------------------------------------------------------ #
+    def stream_run(self, question):
+        """
+        Generator version of `run`: ends on a None model response, a StopException from a tool, or after `max_iterations` rounds.
+        Yields:
+            dict: each assistant reply, one `tool_call` notice (name + raw arguments) per requested tool, and each `tool`-role message appended by `_exec_tool`.
+        """
+        msgs = copy.deepcopy(self.messages)
+        msgs[-1]["content"] = msgs[-1]["content"].replace("QUESTION_PLACEHOLDER", question)
+
+        for i in range(self.max_iterations):
+            # Force a final `finish` on the last iteration
+            if i == self.max_iterations - 1:
+                final_usr_msg = {
+                    "role": "user",
+                    "content": "Please call the `finish` function to finish the task.",
+                }
+                msgs.append(final_usr_msg)
+                # Don't yield user messages to the UI
+
+            response = call_openai_model_with_tools(
+                msgs,
+                endpoints=config.AOAI_ORCHESTRATOR_LLM_ENDPOINT_LIST,
+                model_name=config.AOAI_ORCHESTRATOR_LLM_MODEL_NAME,
+                tools=self.function_schemas,
+                temperature=0.0,
+                api_key=config.OPENAI_API_KEY,
+            )
+            if response is None:
+                return
+
+            response.setdefault("role", "assistant")
+            msgs.append(response)
+            yield response  # ← stream assistant reply
+
+            # Execute any requested tool calls
+            try:
+                for tool_call in response.get("tool_calls", []):
+                    # Yield a formatted message about the tool being called
+                    tool_name = tool_call.get("function", {}).get("name", "unknown")
+                    tool_args = tool_call.get("function", {}).get("arguments", "{}")
+                    yield {
+                        "role": "tool_call",
+                        "name": tool_name,
+                        "arguments": tool_args
+                    }
+                    
+                    self._exec_tool(tool_call, msgs)
+                    # Only yield the tool result message
+                    if msgs[-1].get("role") == "tool":
+                        yield msgs[-1]  # ← stream tool observation
+            except StopException:
+                return
+
 
 def single_run_wrapper(info) -> dict:
     qid, video_db_path, video_caption_path, question = info

From a74611ebfee7cd2aff4e06f93cb53a07c4504302 Mon Sep 17 00:00:00 2001
From: Xiaoyi <xiaoyizhang@microsoft.com>
Date: Thu, 17 Jul 2025 03:58:17 -0700
Subject: [PATCH 04/15] update README and demo link

---
 README.md | 2 +-
 app.py    | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 27f0344..8826ee6 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 [![arXiv](https://img.shields.io/badge/arXiv-2504.16082-A42C25?style=flat&logo=arXiv&logoColor=A42C25)](https://arxiv.org/abs/2505.18079)
 [![License](https://img.shields.io/badge/License-MIT-red.svg)](https://opensource.org/licenses/MIT)
-[![Hugging Face Spaces](https://img.shields.io/badge/Spaces-Demo-blueviolet?logo=huggingface&logoColor=white)](TODO)
+[![Hugging Face Spaces](https://img.shields.io/badge/Spaces-Demo-blueviolet?logo=huggingface&logoColor=white)](https://2b295b16087857c68f.gradio.live/)
 
 
 This repository contains the official implementation of the paper [Deep Video Discovery: Agentic Search with Tool Use for Long-form Video Understanding](https://arxiv.org/abs/2505.18079), which achieves the state-of-the-art performance by a large margin on multiple long video benchmarks including the challenging [LVBench](https://lvbench.github.io/).
diff --git a/app.py b/app.py
index 1a69102..d7fcf0a 100644
--- a/app.py
+++ b/app.py
@@ -42,6 +42,9 @@ def _prepare_video_assets(video_url: str):
     captions_dir = os.path.join(config.VIDEO_DATABASE_FOLDER, video_id, "captions")
     video_db_path= os.path.join(config.VIDEO_DATABASE_FOLDER, video_id, "database.json")
     srt_path     = os.path.join(config.VIDEO_DATABASE_FOLDER, video_id, "subtitles.srt")
+    os.makedirs(os.path.join(config.VIDEO_DATABASE_FOLDER, "raw"), exist_ok=True)
+    os.makedirs(frames_dir, exist_ok=True)
+    os.makedirs(captions_dir, exist_ok=True)
 
     if config.LITE_MODE:
         if not os.path.exists(srt_path):
@@ -126,7 +129,7 @@ def solve(video_url: str, question: str):
         
         # Add final answer if found
         if final_answer:
-            accumulated_text += f"### ✅📃 **Final Answer:**\n\n{final_answer}"
+            accumulated_text += f"\n### 📃✅ **Final Answer:**\n\n{final_answer}"
         else:
             accumulated_text += "\n\n---\n### ✅ **Analysis Complete!**"
             

From 4c879e245e4dece1a723a9a3700ec656a874b76d Mon Sep 17 00:00:00 2001
From: Xiaoyi <xiaoyizhang@microsoft.com>
Date: Mon, 21 Jul 2025 05:47:29 -0700
Subject: [PATCH 05/15] update requirements

---
 README.md        | 5 +++++
 requirements.txt | 8 ++++++++
 2 files changed, 13 insertions(+)
 create mode 100644 requirements.txt

diff --git a/README.md b/README.md
index 8826ee6..10c2730 100644
--- a/README.md
+++ b/README.md
@@ -53,6 +53,11 @@ The core design of DVD includes:
    pip install -r requirements.txt
    ```
 
+3. (Optional) **Install gradio for demo:**
+   ```bash
+   pip install gradio
+   ```
+
 ## Usage
 
 Note: Set up your configuration by updating the variables in  `config.py`.
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..9b7d3e0
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,8 @@
+numpy<2
+opencv-python-headless
+tqdm
+nano-vectordb
+azure-identity
+requests
+yt-dlp
+openai

From 95740e5f3f1f9305dfccf094567f280fde13d303 Mon Sep 17 00:00:00 2001
From: Xiaoyi <xiaoyizhang@microsoft.com>
Date: Wed, 23 Jul 2025 10:42:02 -0700
Subject: [PATCH 06/15] update demo link

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 10c2730..7cb5578 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 [![arXiv](https://img.shields.io/badge/arXiv-2504.16082-A42C25?style=flat&logo=arXiv&logoColor=A42C25)](https://arxiv.org/abs/2505.18079)
 [![License](https://img.shields.io/badge/License-MIT-red.svg)](https://opensource.org/licenses/MIT)
-[![Hugging Face Spaces](https://img.shields.io/badge/Spaces-Demo-blueviolet?logo=huggingface&logoColor=white)](https://2b295b16087857c68f.gradio.live/)
+[![Hugging Face Spaces](https://img.shields.io/badge/Spaces-Demo-blueviolet?logo=huggingface&logoColor=white)](https://e1b3f5d6045818fdc6.gradio.live)
 
 
 This repository contains the official implementation of the paper [Deep Video Discovery: Agentic Search with Tool Use for Long-form Video Understanding](https://arxiv.org/abs/2505.18079), which achieves the state-of-the-art performance by a large margin on multiple long video benchmarks including the challenging [LVBench](https://lvbench.github.io/).

From 217b9e2d85ba54916605f9f2cdd190414d8deeca Mon Sep 17 00:00:00 2001
From: Xiaoyi <xiaoyizhang@microsoft.com>
Date: Thu, 31 Jul 2025 03:55:59 -0700
Subject: [PATCH 07/15] update README

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 7cb5578..70bac8b 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 [![arXiv](https://img.shields.io/badge/arXiv-2504.16082-A42C25?style=flat&logo=arXiv&logoColor=A42C25)](https://arxiv.org/abs/2505.18079)
 [![License](https://img.shields.io/badge/License-MIT-red.svg)](https://opensource.org/licenses/MIT)
-[![Hugging Face Spaces](https://img.shields.io/badge/Spaces-Demo-blueviolet?logo=huggingface&logoColor=white)](https://e1b3f5d6045818fdc6.gradio.live)
+[![Hugging Face Spaces](https://img.shields.io/badge/Spaces-Demo-blueviolet?logo=huggingface&logoColor=white)](https://71d30852e95c6c7dd9.gradio.live)
 
 
 This repository contains the official implementation of the paper [Deep Video Discovery: Agentic Search with Tool Use for Long-form Video Understanding](https://arxiv.org/abs/2505.18079), which achieves the state-of-the-art performance by a large margin on multiple long video benchmarks including the challenging [LVBench](https://lvbench.github.io/).

From 3cc2495e207c9a2f98433ce8644cd1aa0f81019f Mon Sep 17 00:00:00 2001
From: Xiaoyi <xiaoyizhang@microsoft.com>
Date: Fri, 1 Aug 2025 08:47:21 -0700
Subject: [PATCH 08/15] fix auto subtitle bug

---
 README.md          |  2 +-
 dvd/video_utils.py | 16 ++--------------
 2 files changed, 3 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 70bac8b..7087c1a 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 [![arXiv](https://img.shields.io/badge/arXiv-2504.16082-A42C25?style=flat&logo=arXiv&logoColor=A42C25)](https://arxiv.org/abs/2505.18079)
 [![License](https://img.shields.io/badge/License-MIT-red.svg)](https://opensource.org/licenses/MIT)
-[![Hugging Face Spaces](https://img.shields.io/badge/Spaces-Demo-blueviolet?logo=huggingface&logoColor=white)](https://71d30852e95c6c7dd9.gradio.live)
+[![Hugging Face Spaces](https://img.shields.io/badge/Spaces-Demo-blueviolet?logo=huggingface&logoColor=white)](https://87fc7dc81d4b38ed01.gradio.live)
 
 
 This repository contains the official implementation of the paper [Deep Video Discovery: Agentic Search with Tool Use for Long-form Video Understanding](https://arxiv.org/abs/2505.18079), which achieves the state-of-the-art performance by a large margin on multiple long video benchmarks including the challenging [LVBench](https://lvbench.github.io/).
diff --git a/dvd/video_utils.py b/dvd/video_utils.py
index 6660dfb..7f6c891 100644
--- a/dvd/video_utils.py
+++ b/dvd/video_utils.py
@@ -109,9 +109,9 @@ def download_srt_subtitle(video_url: str, output_path: str):
 
     ydl_opts = {
         'writesubtitles': True,
-        'subtitleslangs': ['en'],
         'subtitlesformat': 'srt',
         'skip_download': True,
+        'writeautomaticsub': True,
         'outtmpl': os.path.join(output_dir, '%(id)s.%(ext)s'),
     }
 
@@ -130,18 +130,6 @@ def download_srt_subtitle(video_url: str, output_path: str):
     if downloaded_subtitle_path:
         shutil.move(downloaded_subtitle_path, output_path)
     else:
-        # Try auto-generated subtitles
-        ydl_opts['writeautomaticsub'] = True
-        with yt_dlp.YoutubeDL(ydl_opts) as ydl_auto:
-            ydl_auto.download([video_url])
-        
-        if os.path.exists(os.path.join(output_dir, f"{video_id}.en.vtt")):
-             # yt-dlp might download as .vtt and convert, check for final .srt
-            for f in os.listdir(output_dir):
-                if f.startswith(video_id) and f.endswith('.srt'):
-                    shutil.move(os.path.join(output_dir, f), output_path)
-                    return
-        
         raise FileNotFoundError(f"Could not find SRT subtitle for {video_url}")
 
 
@@ -192,4 +180,4 @@ def decode_video_to_frames(video_path: str) -> str:
     return os.path.abspath(frames_dir)
 
 if __name__ == "__main__":
-    decode_video_to_frames("/home/xiaoyizhang/DVD/video_database/raw/i2qSxMVeVLI.mp4")
\ No newline at end of file
+    download_srt_subtitle("https://www.youtube.com/watch?v=PQFQ-3d2J-8", "./video_database/PQFQ-3d2J-8/subtitles.srt")
\ No newline at end of file

From f0b6e33b0d6c7b2e1e4a574071de7e6caedf17e0 Mon Sep 17 00:00:00 2001
From: Xiaoyi <xiaoyizhang@microsoft.com>
Date: Fri, 1 Aug 2025 09:08:52 -0700
Subject: [PATCH 09/15] update README

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 7087c1a..5a5a042 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,7 @@ This repository contains the official implementation of the paper [Deep Video Di
 
 ## Update
 
+- **2025/08/02**: Support auto subtitle in the demo.
 - **2025/07/17**: Add gradio demo.
 - **2025/07/16**: Add `lite_mode` to enable a lightweight version of the agent that uses only subtitles. Good for Youtube podcast analysis!
 - **2025/07/14**: Support OpenAI API and Azure OpenAI API.

From b608f8db8e9eccc685a3726de374d00dbeb49730 Mon Sep 17 00:00:00 2001
From: Xiaoyi <xiaoyizhang@microsoft.com>
Date: Mon, 4 Aug 2025 01:52:07 -0700
Subject: [PATCH 10/15] upload captions

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 5a5a042..2ea80bc 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,7 @@ This repository contains the official implementation of the paper [Deep Video Di
 
 ## Update
 
+- **2025/08/04**: Upload captions on benchmarks for reproduction: [LVBench](https://1drv.ms/u/c/f029f6f5a52c17c4/ETR7ogx7YCtBgtDu66a4R14B7RKLZJoz20D4Z5I1KD6HTg?e=404kKg), [LVBench w/ transcripts](https://1drv.ms/u/c/f029f6f5a52c17c4/EcqO2lC_hRxGn-0t0IBNKZcBts3HDCEg8mZo4ltN6kXFUQ?e=XmabUn), [Video-MME](https://1drv.ms/u/c/f029f6f5a52c17c4/EVKjXQnPjeZGi-onOxEMb8UBxqI9NexKzccHuYEe8-0Lig?e=a4SxCU), [LongVideoBench](https://1drv.ms/u/c/f029f6f5a52c17c4/EQp_PABeb3ZIiysjIn-_5gEBbkhtfcBwCM1pel9xl3JHPg?e=TLpQXQ) and [EgoSchema](https://1drv.ms/u/c/f029f6f5a52c17c4/Ec0oEX3tO5pIknRdEqT9LDQB0hbS9vR9fUJaVbRfCQPJKg?e=bszgh6).
 - **2025/08/02**: Support auto subtitle in the demo.
 - **2025/07/17**: Add gradio demo.
 - **2025/07/16**: Add `lite_mode` to enable a lightweight version of the agent that uses only subtitles. Good for Youtube podcast analysis!

From 60217809751c892b3fd548163b57a147b84bd715 Mon Sep 17 00:00:00 2001
From: anonymous626 <131758638+anonymous626@users.noreply.github.com>
Date: Wed, 6 Aug 2025 14:53:31 +0800
Subject: [PATCH 11/15] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 2ea80bc..20eae2d 100644
--- a/README.md
+++ b/README.md
@@ -69,7 +69,7 @@ Note: Set up your configuration by updating the variables in  `config.py`.
 The `local_run.py` script provides an example of how to run the Deep Video Discovery agent by providing a youtube url and question about it.
 
     ```bash
-    python local_run.py https://www.youtube.com/watch?v=ktbGziZlt3c "how many animals appear in this video"
+    python local_run.py https://www.youtube.com/watch?v=PQFQ-3d2J-8 "what did the main speaker talk about in the last part of video?"
     ```
 
 ## TODO

From 60ed1567ff098391cc158966e5fa8af004cedb63 Mon Sep 17 00:00:00 2001
From: anonymous626 <131758638+anonymous626@users.noreply.github.com>
Date: Fri, 19 Sep 2025 15:41:09 +0800
Subject: [PATCH 12/15] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 20eae2d..58aaebf 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,7 @@ This repository contains the official implementation of the paper [Deep Video Di
 
 ## Update
 
+- **2025/09/19**: Accepted by NeurIPS 2025 🎉
 - **2025/08/04**: Upload captions on benchmarks for reproduction: [LVBench](https://1drv.ms/u/c/f029f6f5a52c17c4/ETR7ogx7YCtBgtDu66a4R14B7RKLZJoz20D4Z5I1KD6HTg?e=404kKg), [LVBench w/ transcripts](https://1drv.ms/u/c/f029f6f5a52c17c4/EcqO2lC_hRxGn-0t0IBNKZcBts3HDCEg8mZo4ltN6kXFUQ?e=XmabUn), [Video-MME](https://1drv.ms/u/c/f029f6f5a52c17c4/EVKjXQnPjeZGi-onOxEMb8UBxqI9NexKzccHuYEe8-0Lig?e=a4SxCU), [LongVideoBench](https://1drv.ms/u/c/f029f6f5a52c17c4/EQp_PABeb3ZIiysjIn-_5gEBbkhtfcBwCM1pel9xl3JHPg?e=TLpQXQ) and [EgoSchema](https://1drv.ms/u/c/f029f6f5a52c17c4/Ec0oEX3tO5pIknRdEqT9LDQB0hbS9vR9fUJaVbRfCQPJKg?e=bszgh6).
 - **2025/08/02**: Support auto subtitle in the demo.
 - **2025/07/17**: Add gradio demo.

From 68ad71428545b690224094bcbd6e66faa0ce18f1 Mon Sep 17 00:00:00 2001
From: Ubuntu
 <azureuser@msraima100gpu009.sodzjw3wlboepoqhpte4arfkbg.px.internal.cloudapp.net>
Date: Wed, 15 Oct 2025 14:14:16 +0000
Subject: [PATCH 13/15] update reproduce guide

---
 README.md                     |   1 +
 dvd/dvd_core.py               |  21 ++++
 dvd/utils.py                  |   9 +-
 reproduce/REPRODUCE.md        |  29 +++++
 reproduce/decode_frames.py    | 221 ++++++++++++++++++++++++++++++++++
 reproduce/download_lvbench.sh |   5 +
 reproduce/prepare_database.py |  52 ++++++++
 reproduce/run_benchmark.py    |  57 +++++++++
 8 files changed, 391 insertions(+), 4 deletions(-)
 create mode 100644 reproduce/REPRODUCE.md
 create mode 100644 reproduce/decode_frames.py
 create mode 100644 reproduce/download_lvbench.sh
 create mode 100644 reproduce/prepare_database.py
 create mode 100644 reproduce/run_benchmark.py

diff --git a/README.md b/README.md
index 58aaebf..805df16 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,7 @@ This repository contains the official implementation of the paper [Deep Video Di
 
 ## Update
 
+- **2025/10/15**: Add a markdown to help to reproduce the LVBench results: [REPRODUCE.md](reproduce/REPRODUCE.md). Email me (xiaoyizhang@microsoft) if your issue get no response in 24 hours.
 - **2025/09/19**: Accepted by NeurIPS 2025 🎉
 - **2025/08/04**: Upload captions on benchmarks for reproduction: [LVBench](https://1drv.ms/u/c/f029f6f5a52c17c4/ETR7ogx7YCtBgtDu66a4R14B7RKLZJoz20D4Z5I1KD6HTg?e=404kKg), [LVBench w/ transcripts](https://1drv.ms/u/c/f029f6f5a52c17c4/EcqO2lC_hRxGn-0t0IBNKZcBts3HDCEg8mZo4ltN6kXFUQ?e=XmabUn), [Video-MME](https://1drv.ms/u/c/f029f6f5a52c17c4/EVKjXQnPjeZGi-onOxEMb8UBxqI9NexKzccHuYEe8-0Lig?e=a4SxCU), [LongVideoBench](https://1drv.ms/u/c/f029f6f5a52c17c4/EQp_PABeb3ZIiysjIn-_5gEBbkhtfcBwCM1pel9xl3JHPg?e=TLpQXQ) and [EgoSchema](https://1drv.ms/u/c/f029f6f5a52c17c4/Ec0oEX3tO5pIknRdEqT9LDQB0hbS9vR9fUJaVbRfCQPJKg?e=bszgh6).
 - **2025/08/02**: Support auto subtitle in the demo.
diff --git a/dvd/dvd_core.py b/dvd/dvd_core.py
index 630d181..db5f014 100644
--- a/dvd/dvd_core.py
+++ b/dvd/dvd_core.py
@@ -9,6 +9,7 @@
 from dvd.func_call_shema import as_json_schema
 from dvd.func_call_shema import doc as D
 from dvd.utils import call_openai_model_with_tools
+from concurrent.futures import ThreadPoolExecutor, as_completed
 
 TOPK = 16
 
@@ -158,6 +159,26 @@ def run(self, question) -> list[dict]:
 
         return msgs
 
+    def parallel_run(self, questions, max_workers=4) -> list[list[dict]]:
+        """
+        Run multiple questions in parallel.
+        """
+        results = []
+        results = [None] * len(questions)
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            future_to_index = {
+                executor.submit(self.run, q): idx
+                for idx, q in enumerate(questions)
+            }
+            for future in as_completed(future_to_index):
+                idx = future_to_index[future]
+                try:
+                    results[idx] = future.result()
+                except Exception as e:
+                    print(f"Error processing question: {e}")
+                    results[idx] = None
+        return results
+
     # ------------------------------------------------------------------ #
     # Streaming (generator) loop
     # ------------------------------------------------------------------ #
diff --git a/dvd/utils.py b/dvd/utils.py
index 077cb28..663bbf4 100644
--- a/dvd/utils.py
+++ b/dvd/utils.py
@@ -222,16 +222,17 @@ def extract_answer(message: dict) -> str | None:
     str | None
         The extracted answer, or ``None`` if no answer could be found.
     """
-    # Direct text response
-    if (content := message.get("content")):
-        return content.strip()
-
     # Tool-based response
     for call in message.get("tool_calls", []):
         args_json = call["function"]["arguments"]
         args = json.loads(args_json)
         if (answer := args.get("answer")):
             return answer
+
+    # Direct text response
+    if (content := message.get("content")):
+        return content.strip()
+    
     return None
 
 
diff --git a/reproduce/REPRODUCE.md b/reproduce/REPRODUCE.md
new file mode 100644
index 0000000..53d547d
--- /dev/null
+++ b/reproduce/REPRODUCE.md
@@ -0,0 +1,29 @@
+# Reproduce
+
+0. Setup database root path with `export DATABASE_DIR=/path/to/your/database/folder`.
+
+1. Download the pre-built database from [here](https://huggingface.co/datasets/xyzhang626/LongVideoBenchmarkCaptions/tree/main). Or you can use the script `wget https://huggingface.co/datasets/xyzhang626/LongVideoBenchmarkCaptions/resolve/main/LVBench_4.1.zip` to download the database.
+
+2. Prepare the database json files. You can use the script `prepare_database.py` to prepare them; pass it the path to your downloaded LVBench database zip. It will generate the database json files into `$DATABASE_DIR/LVBench_4.1`.
+
+```bash
+python -m reproduce.prepare_database /path/to/your/zipfile $DATABASE_DIR
+```
+
+3. Download the LVBench dataset. You can find these third-party assets [here](https://huggingface.co/datasets/AIWinter/LVBench/tree/main), or use the script below to download the dataset.
+
+```bash
+export TARGET_DIR=$DATABASE_DIR/LVBench_4.1
+bash reproduce/download_lvbench.sh
+```
+
+4. Decode the videos into raw frames, you could use the script in `decode_frames.py`, please modify the path to your downloaded LVBench dataset.
+
+```bash
+python -m reproduce.decode_frames --part $DATABASE_DIR/LVBench_4.1/all_videos_split.zip.001 --out $TARGET_DIR --fps 2
+```
+
+5. Run the benchmark. You can use the script in `run_benchmark.py` to run the benchmark. Please modify the path to your prepared database json files.
+```bash
+python -m reproduce.run_benchmark $TARGET_DIR  $TARGET_DIR/video_info.meta.jsonl
+```
diff --git a/reproduce/decode_frames.py b/reproduce/decode_frames.py
new file mode 100644
index 0000000..6788b49
--- /dev/null
+++ b/reproduce/decode_frames.py
@@ -0,0 +1,221 @@
+import argparse
+import os
+import re
+import sys
+import tempfile
+import zipfile
+import shutil
+import logging
+from pathlib import Path
+from typing import List, Iterable
+import cv2
+from tqdm import tqdm
+import multiprocessing as mp
+
+VIDEO_EXTS = {'.mp4', '.mov', '.avi', '.mkv', '.webm', '.flv', '.mpg', '.mpeg', '.m4v'}
+
+
+def find_all_parts(part_path: Path) -> List[Path]:
+    """
+    Given any split archive file, find all parts in the same directory and return them in order.
+    Supports *.zip.001 / *.zip.002 or *.z01 / *.z02 + .zip formats (compatible with common naming).
+    """
+    name = part_path.name
+    parent = part_path.parent
+
+    # Match something like something.zip.001 or something.zip.002
+    m = re.match(r'(.+\.zip)\.(\d{3})$', name)
+    if m:
+        base = m.group(1)
+        parts = sorted(parent.glob(base + ".???"))
+        return parts
+
+    # Match WinZip style: something.z01, something.z02 ... + something.zip
+    m2 = re.match(r'(.+)\.z(\d{2})$', name, re.IGNORECASE)
+    if m2:
+        base_prefix = m2.group(1)
+        zparts = sorted(parent.glob(base_prefix + ".z??"), key=lambda p: p.suffix.lower())
+        main_zip = parent / (base_prefix + ".zip")
+        if main_zip.exists():
+            return zparts + [main_zip]
+
+    # If it's a complete zip file
+    if name.endswith(".zip"):
+        return [part_path]
+
+    raise ValueError(f"Unrecognized split archive naming: {name}")
+
+
+def assemble_zip(parts: List[Path]) -> Path:
+    """
+    Concatenate all parts in order into a temporary zip file and return its path.
+    If there is only one .zip file, return its path directly (no copy).
+    """
+    if len(parts) == 1 and parts[0].suffix == ".zip":
+        return parts[0]
+
+    tmp_fd, tmp_path = tempfile.mkstemp(suffix=".zip", prefix="merged_zip_")
+    os.close(tmp_fd)
+    with open(tmp_path, "wb") as w:
+        for p in parts:
+            logging.info(f"Merging part: {p.name}")
+            with open(p, "rb") as r:
+                shutil.copyfileobj(r, w, length=1024 * 1024)
+    return Path(tmp_path)
+
+
+def iter_video_members(zf: zipfile.ZipFile) -> Iterable[zipfile.ZipInfo]:
+    for info in zf.infolist():
+        if info.is_dir():
+            continue
+        ext = Path(info.filename).suffix.lower()
+        if ext in VIDEO_EXTS:
+            yield info
+
+
+def ensure_dir(path: Path):
+    path.mkdir(parents=True, exist_ok=True)
+
+
+def decode_video(temp_video_path: Path, out_root: Path, fps: float, overwrite: bool = False, video_stem: str | None = None):
+    """
+    Extract frames at the given fps and save them.
+    video_stem: Pass the original video filename (without extension) to avoid using the temporary filename.
+    """
+    cap = cv2.VideoCapture(str(temp_video_path))
+    if not cap.isOpened():
+        logging.error(f"Cannot open video: {temp_video_path}")
+        return
+
+    orig_fps = cap.get(cv2.CAP_PROP_FPS) or 0
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
+    if orig_fps <= 0:
+        logging.warning(f"Original FPS abnormal ({orig_fps}), will estimate by frame time.")
+        orig_fps = fps  # fallback
+
+    interval = 1.0 / fps
+    next_t = 0.0
+    frame_index = 0
+    saved_index = 0
+
+    # Use original filename (if provided)
+    video_stem = video_stem or temp_video_path.stem
+    frames_dir = out_root / video_stem / "frames"
+    ensure_dir(frames_dir)
+
+    if not overwrite:
+        # Count existing frames, auto-continue
+        existing = sorted(frames_dir.glob("frames_n*.jpg"))
+        if existing:
+            last = existing[-1].stem
+            m = re.search(r'frames_n(\d+)', last)
+            if m:
+                saved_index = int(m.group(1))
+                logging.info(f"{video_stem}: Append mode, {saved_index} frames already exist.")
+    
+    pbar = tqdm(total=total_frames if total_frames > 0 else None,
+                desc=f"Decoding {video_stem}",
+                unit="f",
+                dynamic_ncols=True)
+
+    while True:
+        ret, frame = cap.read()
+        if not ret:
+            break
+        current_t = frame_index / orig_fps
+        if current_t + 1e-6 >= next_t:
+            saved_index += 1
+            out_path = frames_dir / f"frames_n{saved_index:06d}.jpg"
+            if overwrite or not out_path.exists():
+                cv2.imwrite(str(out_path), frame, [int(cv2.IMWRITE_JPEG_QUALITY), 95])
+            next_t += interval
+        frame_index += 1
+        pbar.update(1)
+    pbar.close()
+    cap.release()
+    logging.info(f"{video_stem}: Frame extraction complete, total (including existing) {saved_index} frames.")
+
+
+def process_archive(part_path: Path, out_root: Path, fps: float, overwrite: bool):
+    parts = find_all_parts(part_path)
+    logging.info("Found parts in order: " + ", ".join(p.name for p in parts))
+    merged_zip = assemble_zip(parts)
+    cleanup_needed = merged_zip not in parts  # If we generated a temporary file
+    try:
+        with zipfile.ZipFile(merged_zip, 'r') as zf:
+            video_members = list(iter_video_members(zf))
+            logging.info(f"Number of video files in archive: {len(video_members)}")
+            temp_files = []  # (process, tmp_path)
+            max_workers = min(len(video_members), mp.cpu_count() or 1)
+            logging.info(f"Parallel decoding: using {max_workers} processes")
+
+            def wait_one():
+                # Wait for the earliest started process to finish and clean up temp file
+                proc, tpath = temp_files.pop(0)
+                proc.join()
+                try:
+                    tpath.unlink(missing_ok=True)
+                except Exception:
+                    pass
+
+            for info in video_members:
+                # Write this video to a temporary file
+                suffix = Path(info.filename).suffix
+                with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
+                    tmp.write(zf.read(info))
+                    tmp_path = Path(tmp.name)
+
+                original_stem = Path(info.filename).stem
+                # Start a separate process for decoding, passing the original filename
+                p = mp.Process(target=decode_video, args=(tmp_path, out_root, fps), kwargs={'overwrite': overwrite, 'video_stem': original_stem})
+                p.start()
+                temp_files.append((p, tmp_path))
+
+                # If reached parallel limit, wait for the earliest one to finish
+                if len(temp_files) >= max_workers:
+                    wait_one()
+
+            # Wait for all remaining to finish
+            while temp_files:
+                wait_one()
+    finally:
+        if cleanup_needed:
+            try:
+                merged_zip.unlink(missing_ok=True)
+            except Exception:
+                pass
+
+
+def parse_args():
+    ap = argparse.ArgumentParser(description="Extract video frames from split zip archives at a given fps")
+    ap.add_argument("--part", required=True, help="Path to any split archive file (e.g. all_videos_split.zip.002)")
+    ap.add_argument("--out", required=True, help="Output root directory")
+    ap.add_argument("--fps", type=float, required=True, help="Target frame extraction rate (e.g. 5)")
+    ap.add_argument("--overwrite", action="store_true", help="Overwrite existing frames")
+    ap.add_argument("--log-level", default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"])
+    return ap.parse_args()
+
+def main():
+    args = parse_args()
+    logging.basicConfig(
+        level=getattr(logging, args.log_level),
+        format="%(asctime)s [%(levelname)s] %(message)s",
+    )
+
+    part_path = Path(args.part).expanduser().resolve()
+    out_root = Path(args.out).expanduser().resolve()
+    ensure_dir(out_root)
+
+    if args.fps <= 0:
+        logging.error("fps must be > 0")
+        sys.exit(1)
+
+    if not part_path.exists():
+        logging.error(f"File does not exist: {part_path}")
+        sys.exit(1)
+
+    process_archive(part_path, out_root, args.fps, overwrite=args.overwrite)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/reproduce/download_lvbench.sh b/reproduce/download_lvbench.sh
new file mode 100644
index 0000000..999d999
--- /dev/null
+++ b/reproduce/download_lvbench.sh
@@ -0,0 +1,5 @@
+mkdir -p "$TARGET_DIR"  # TARGET_DIR must be exported by the caller
+wget --continue -P "$TARGET_DIR" "https://huggingface.co/datasets/zai-org/LVBench/resolve/main/video_info.meta.jsonl"  # metadata first, outside the pipeline below
+printf '%03d\n' {3..14} | \
+xargs -P 8 -I{} wget --continue -P "$TARGET_DIR" \
+  "https://huggingface.co/datasets/AIWinter/LVBench/resolve/main/all_videos_split.zip.{}"
diff --git a/reproduce/prepare_database.py b/reproduce/prepare_database.py
new file mode 100644
index 0000000..d37230e
--- /dev/null
+++ b/reproduce/prepare_database.py
@@ -0,0 +1,52 @@
+import os
+import json
+import zipfile
+from pathlib import Path
+import argparse
+
+def replace_root_path(zip_file_path, database_dir):
+    """
+    Read a zip file, replace 'video_file_root' in JSON files, and save to the specified directory.
+    
+    Args:
+        zip_file_path: Path to the zip file.
+        database_dir: Directory for the database.
+    """
+    zip_file_name = Path(zip_file_path).stem
+
+    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
+        # Iterate through all files in the zip
+        for file_name in zip_ref.namelist():
+            if file_name.endswith('.json'):
+                # Read the JSON file
+                with zip_ref.open(file_name) as json_file:
+                    data = json.load(json_file)
+                
+                # Replace video_file_root
+                new_root = os.path.join(database_dir, zip_file_name)
+                data['video_file_root'] = new_root
+                
+                # Create output directory
+                json_name = Path(file_name).stem
+                output_dir = os.path.join(database_dir, zip_file_name, json_name)
+                os.makedirs(output_dir, exist_ok=True)
+                
+                # Save the JSON file
+                output_path = os.path.join(output_dir, 'database.json')
+                with open(output_path, 'w', encoding='utf-8') as f:
+                    json.dump(data, f, ensure_ascii=False, indent=2)
+                
+                print(f"Processed: {file_name} -> {output_path}")
+
+if __name__ == "__main__":
+    # Example usage
+    parser = argparse.ArgumentParser(description='Replace video_file_root path in JSON files inside a zip archive')
+    parser.add_argument('zip_file', type=str, help='Path to the zip file')
+    parser.add_argument('database_dir', type=str, help='Path to the database directory')
+    
+    args = parser.parse_args()
+    
+    zip_file = args.zip_file
+    database_dir = args.database_dir
+
+    replace_root_path(zip_file, database_dir)
\ No newline at end of file
diff --git a/reproduce/run_benchmark.py b/reproduce/run_benchmark.py
new file mode 100644
index 0000000..36d630d
--- /dev/null
+++ b/reproduce/run_benchmark.py
@@ -0,0 +1,57 @@
+import dvd.config as config
+import os
+import argparse
+import json
+from dvd.dvd_core import DVDCoreAgent
+from dvd.video_utils import load_video, decode_video_to_frames, download_srt_subtitle
+from dvd.frame_caption import process_video, process_video_lite
+from dvd.utils import extract_answer
+
+def main():
+    """Answer every benchmark question with DVDCoreAgent and dump the results.
+
+    The metadata file is JSON Lines: one video per line with a "key" (video id)
+    and a "qa" list whose items carry "uid" and "question". Videos without
+    extracted frames are skipped. Results go to ./benchmark_results.json.
+    """
+    parser = argparse.ArgumentParser(description="Run DVDCoreAgent on a video.")
+    parser.add_argument("benchmark_database_folder", help="The path to the benchmark database folder.")
+    parser.add_argument("benchmark_metadata", help="The path to the benchmark metadata file.")
+    args = parser.parse_args()
+    benchmark_database_folder = args.benchmark_database_folder
+
+    results = {}
+    # UTF-8 is explicit so non-ASCII questions do not depend on the platform locale.
+    with open(args.benchmark_metadata, "r", encoding="utf-8") as f:
+        # Skip blank lines (e.g. a trailing newline) instead of crashing in json.loads.
+        lines = [line for line in f if line.strip()]
+
+    for line in lines:
+        # one line for one video instance containing multiple questions
+        video_info = json.loads(line)
+        video_id = video_info["key"]
+        qa_list = video_info["qa"]
+        qids = [qa["uid"] for qa in qa_list]
+        questions = [qa["question"] for qa in qa_list]
+
+        frames_dir = os.path.join(benchmark_database_folder, video_id, "frames")
+        if not os.path.isdir(frames_dir) or not os.listdir(frames_dir):
+            print(f"Frames for video {frames_dir} not found, skipping...")
+            continue
+        video_db_path = os.path.join(benchmark_database_folder, video_id, "database.json")
+
+        print(f"Initializing DVDCoreAgent from database {video_db_path}...")
+        agent = DVDCoreAgent(video_db_path, video_caption_path=None, max_iterations=15)
+        agent.messages[-1]['content'] += "\nSelect the best option that accurately addresses the question.\nAnswer with the option\'s letter from the given choices directly and only give the best option."
+        print("Agent initialized.")
+        msgs = agent.parallel_run(questions, max_workers=4)
+        for qid, question, msg in zip(qids, questions, msgs):
+            results[qid] = {"question": question, "answer": extract_answer(msg[-1]), "reasoning": msg}
+
+    # ensure_ascii=False emits raw Unicode, so the output encoding must be UTF-8.
+    with open("benchmark_results.json", "w", encoding="utf-8") as f:
+        json.dump(results, f, ensure_ascii=False, indent=2)
+
+if __name__ == "__main__":
+    main()
+

From 6e0247a4e30baf143e5dba5dd7135cf7c3318058 Mon Sep 17 00:00:00 2001
From: Ubuntu
 <azureuser@msraima100gpu009.sodzjw3wlboepoqhpte4arfkbg.px.internal.cloudapp.net>
Date: Wed, 15 Oct 2025 14:22:34 +0000
Subject: [PATCH 14/15] update reproduce README

---
 README.md                             | 2 +-
 reproduce/{REPRODUCE.md => README.md} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename reproduce/{REPRODUCE.md => README.md} (100%)

diff --git a/README.md b/README.md
index 805df16..332dd9c 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ This repository contains the official implementation of the paper [Deep Video Di
 
 ## Update
 
-- **2025/10/15**: Add a markdown to help to reproduce the LVBench results: [REPRODUCE.md](reproduce/REPRODUCE.md). Email me (xiaoyizhang@microsoft) if your issue get no response in 24 hours.
+- **2025/10/15**: Added a guide for reproducing the LVBench results: [reproduce/README.md](reproduce/README.md). Email me (xiaoyizhang [/at/] microsoft.com) if your issue gets no response within 24 hours.
 - **2025/09/19**: Accepted by NeurIPS 2025 🎉
 - **2025/08/04**: Upload captions on benchmarks for reproduction: [LVBench](https://1drv.ms/u/c/f029f6f5a52c17c4/ETR7ogx7YCtBgtDu66a4R14B7RKLZJoz20D4Z5I1KD6HTg?e=404kKg), [LVBench w/ transcripts](https://1drv.ms/u/c/f029f6f5a52c17c4/EcqO2lC_hRxGn-0t0IBNKZcBts3HDCEg8mZo4ltN6kXFUQ?e=XmabUn), [Video-MME](https://1drv.ms/u/c/f029f6f5a52c17c4/EVKjXQnPjeZGi-onOxEMb8UBxqI9NexKzccHuYEe8-0Lig?e=a4SxCU), [LongVideoBench](https://1drv.ms/u/c/f029f6f5a52c17c4/EQp_PABeb3ZIiysjIn-_5gEBbkhtfcBwCM1pel9xl3JHPg?e=TLpQXQ) and [EgoSchema](https://1drv.ms/u/c/f029f6f5a52c17c4/Ec0oEX3tO5pIknRdEqT9LDQB0hbS9vR9fUJaVbRfCQPJKg?e=bszgh6).
 - **2025/08/02**: Support auto subtitle in the demo.
diff --git a/reproduce/REPRODUCE.md b/reproduce/README.md
similarity index 100%
rename from reproduce/REPRODUCE.md
rename to reproduce/README.md

From 000ac61ad38fde9d4f4e3ef2399282f037abe228 Mon Sep 17 00:00:00 2001
From: anonymous626 <131758638+anonymous626@users.noreply.github.com>
Date: Mon, 3 Nov 2025 15:40:40 +0800
Subject: [PATCH 15/15] Add audio transcription script for reproduction

This script transcribes audio files from a specified directory using the WhisperX model, aligns the output, and saves the results in JSON format.
---
 reproduce/transcribe.py | 53 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100644 reproduce/transcribe.py

diff --git a/reproduce/transcribe.py b/reproduce/transcribe.py
new file mode 100644
index 0000000..790f72d
--- /dev/null
+++ b/reproduce/transcribe.py
@@ -0,0 +1,53 @@
+"""Transcribe every .mp3 under ``root`` with WhisperX, align it, and save JSON next to it."""
+import json
+import os
+import whisperx
+import gc
+from whisperx.alignment import DEFAULT_ALIGN_MODELS_TORCH, DEFAULT_ALIGN_MODELS_HF
+
+device = "cuda"
+batch_size = 64 # reduce if low on GPU mem
+compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)
+
+# 1. Transcribe with original whisper (batched)
+model = whisperx.load_model("large-v3", device, compute_type=compute_type)
+# 3. Assign speaker labels (diarization is currently disabled)
+HF_TOKEN = "YOUR_HF_TOKEN"
+# diarize_model = whisperx.DiarizationPipeline(use_auth_token=HF_TOKEN, device=device)
+
+root = "./lvbench_vdb"
+# Alignment model of the most recent language; reloaded only when the language changes.
+align_lang, model_a, metadata = None, None, None
+for file in os.listdir(root):
+    # save model to local path (optional)
+    # model_dir = "/path/"
+    # model = whisperx.load_model("large-v2", device, compute_type=compute_type, download_root=model_dir)
+    if not file.endswith(".mp3"):
+        continue
+
+    audio_file = os.path.join(root, file)
+    # Swap only the trailing ".mp3"; str.replace would also rewrite earlier occurrences.
+    json_file = audio_file[:-len(".mp3")] + ".json"
+    if os.path.exists(json_file):
+        # Already transcribed: really skip instead of redoing the work and overwriting.
+        print(f"File {json_file} already exists, skipping...")
+        continue
+
+    audio = whisperx.load_audio(audio_file)
+    result = model.transcribe(audio, batch_size=batch_size)
+
+    lang = result["language"]
+    if lang not in DEFAULT_ALIGN_MODELS_TORCH and lang not in DEFAULT_ALIGN_MODELS_HF:
+        print(f"Language {lang} not supported, using English instead for {audio_file}.")
+        lang = 'en'
+
+    # 2. Align whisper output
+    if lang != align_lang:
+        model_a, metadata = whisperx.load_align_model(language_code=lang, device=device)
+        align_lang = lang
+    result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
+
+    with open(json_file, "w") as f:
+        json.dump(result, f, indent=4)
+    print(f"saved as {json_file}")
+