From afa10cc1cbaac1798e9984128c2e83b5c90ab73b Mon Sep 17 00:00:00 2001 From: Xiaoyi Date: Tue, 15 Jul 2025 23:54:39 -0700 Subject: [PATCH 01/15] support openai api --- README.md | 2 +- dvd/build_database.py | 6 +++ dvd/config.py | 4 ++ dvd/dvd_core.py | 1 + dvd/frame_caption.py | 2 + dvd/utils.py | 121 ++++++++++++++++++++++++++++-------------- 6 files changed, 94 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index bdb2e7d..3637ccd 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ The `local_run.py` script provides an example of how to run the Deep Video Disco ## TODO -- [ ] Support OpenAI API key configuration. +- [x] Support OpenAI API key configuration. - [ ] Implement MCP server. - [ ] Release evaluation trajectory data on long video benchmarks. diff --git a/dvd/build_database.py b/dvd/build_database.py index 5361f11..51a3db5 100644 --- a/dvd/build_database.py +++ b/dvd/build_database.py @@ -102,6 +102,7 @@ def frame_inspect_tool( messages=input_msgs, endpoints=config.AOAI_TOOL_VLM_ENDPOINT_LIST, model_name=config.AOAI_TOOL_VLM_MODEL_NAME, + api_key=config.OPENAI_API_KEY, image_paths=files, temperature=0, max_tokens=512, @@ -130,6 +131,7 @@ def clip_search_tool( endpoints=config.AOAI_EMBEDDING_RESOURCE_LIST, model_name=config.AOAI_EMBEDDING_LARGE_MODEL_NAME, input_text=[event_description], + api_key=config.OPENAI_API_KEY, )[0]['embedding'] results = database.query( query_emb, @@ -164,6 +166,7 @@ def global_browse_tool( endpoints=config.AOAI_EMBEDDING_RESOURCE_LIST, model_name=config.AOAI_EMBEDDING_LARGE_MODEL_NAME, input_text=[query], + api_key=config.OPENAI_API_KEY, )[0]['embedding'] results = database.query( query_emb, @@ -202,6 +205,7 @@ def global_browse_tool( messages=input_msgs, endpoints=config.AOAI_TOOL_VLM_ENDPOINT_LIST, model_name=config.AOAI_TOOL_VLM_MODEL_NAME, + api_key=config.OPENAI_API_KEY, temperature=0, max_tokens=512, ) @@ -318,6 +322,7 @@ def single_batch_embedding_task(data): 
endpoints=config.AOAI_EMBEDDING_RESOURCE_LIST, model_name=config.AOAI_EMBEDDING_LARGE_MODEL_NAME, input_text=captions, + api_key=config.OPENAI_API_KEY, ) max_tries = 3 while embs is None or len(embs) != len(captions): @@ -329,6 +334,7 @@ def single_batch_embedding_task(data): endpoints=config.AOAI_EMBEDDING_RESOURCE_LIST, model_name=config.AOAI_EMBEDDING_LARGE_MODEL_NAME, input_text=captions, + api_key=config.OPENAI_API_KEY, ) return list(zip(timestamps, cap_infos, [d['embedding'] for d in embs])) diff --git a/dvd/config.py b/dvd/config.py index e5e00fa..7bde7a5 100644 --- a/dvd/config.py +++ b/dvd/config.py @@ -1,3 +1,5 @@ +import os + # ------------------ video download and segmentation configuration ------------------ # VIDEO_DATABASE_FOLDER = "./video_database/" VIDEO_RESOLUTION = "360" # denotes the height of the video @@ -5,6 +7,8 @@ CLIP_SECS = 10 # seconds # ------------------ model configuration ------------------ # +OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", None) # will overwrite Azure OpenAI setting + AOAI_CAPTION_VLM_ENDPOINT_LIST = [""] AOAI_CAPTION_VLM_MODEL_NAME = "gpt-4.1-mini" diff --git a/dvd/dvd_core.py b/dvd/dvd_core.py index 656d6b8..142af9c 100644 --- a/dvd/dvd_core.py +++ b/dvd/dvd_core.py @@ -139,6 +139,7 @@ def run(self, question) -> list[dict]: model_name=config.AOAI_ORCHESTRATOR_LLM_MODEL_NAME, tools=self.function_schemas, temperature=0.0, + api_key=config.OPENAI_API_KEY, ) if response is None: return None diff --git a/dvd/frame_caption.py b/dvd/frame_caption.py index eea8684..d0a36b8 100644 --- a/dvd/frame_caption.py +++ b/dvd/frame_caption.py @@ -233,6 +233,7 @@ def _caption_clip(task: Tuple[str, Dict], caption_ckpt_folder) -> Tuple[str, dic model_name=config.AOAI_CAPTION_VLM_MODEL_NAME, return_json=True, image_paths=files, + api_key=config.OPENAI_API_KEY, )["content"] if resp is None: continue @@ -269,6 +270,7 @@ def merge_subject_registries(registries: List[dict]) -> dict: endpoints=config.AOAI_CAPTION_VLM_ENDPOINT_LIST, 
model_name=config.AOAI_CAPTION_VLM_MODEL_NAME, return_json=True, + api_key=config.OPENAI_API_KEY, )["content"] if resp is None: continue diff --git a/dvd/utils.py b/dvd/utils.py index 0233a99..077cb28 100644 --- a/dvd/utils.py +++ b/dvd/utils.py @@ -76,6 +76,7 @@ def call_openai_model_with_tools( messages, endpoints, model_name, + api_key: str = None, tools: list = [], # List of tool definitions image_paths: list = [], max_tokens: int = 4096, @@ -83,22 +84,32 @@ def call_openai_model_with_tools( tool_choice: str = "auto", # Can be "auto", "none", or a specific tool return_json: bool = False, ) -> dict: - credential = AzureCliCredential() - token = credential.get_token('https://cognitiveservices.azure.com/') - headers = { - "Content-Type": "application/json", - 'Authorization': 'Bearer ' + token.token - } - if isinstance(endpoints, str): - endpoint = endpoints - elif isinstance(endpoints, list): - endpoint = random.choice(endpoints) + if api_key: + headers = { + "Content-Type": "application/json", + 'Authorization': 'Bearer ' + api_key + } + endpoint = "https://api.openai.com/v1" + url = f"{endpoint}/chat/completions" else: - raise ValueError("Endpoints must be a string or a list of strings.") + credential = AzureCliCredential() + token = credential.get_token('https://cognitiveservices.azure.com/') + headers = { + "Content-Type": "application/json", + 'Authorization': 'Bearer ' + token.token + } + if isinstance(endpoints, str): + endpoint = endpoints + elif isinstance(endpoints, list): + endpoint = random.choice(endpoints) + else: + raise ValueError("Endpoints must be a string or a list of strings.") + url = f"{endpoint}/openai/deployments/{model_name}/chat/completions?api-version=2025-03-01-preview" + model = model_name - url = f"{endpoint}/openai/deployments/{model}/chat/completions?api-version=2025-03-01-preview" payload = { + "model": model, "messages": copy.deepcopy(messages), # "reasoning_effort": reasoning_effort, } @@ -140,7 +151,7 @@ def 
call_openai_model_with_tools( class AzureOpenAIEmbeddingService: @staticmethod @retry_with_exponential_backoff - def get_embeddings(endpoints, model_name, input_text): + def get_embeddings(endpoints, model_name, input_text, api_key: str = None): """ Call Azure OpenAI Embedding service and get embeddings for the input text. @@ -150,27 +161,35 @@ def get_embeddings(endpoints, model_name, input_text): :param input_text: The text for which you want to generate embeddings. :return: The embeddings as a JSON response. """ - if isinstance(endpoints, str): - endpoint = endpoints - elif isinstance(endpoints, list): - endpoint = random.choice(endpoints) + if api_key: + headers = { + "Content-Type": "application/json", + 'Authorization': 'Bearer ' + api_key + } + endpoint = "https://api.openai.com/v1" + url = f"{endpoint}/embeddings" else: - raise ValueError("Endpoints must be a string or a list of strings.") + if isinstance(endpoints, str): + endpoint = endpoints + elif isinstance(endpoints, list): + endpoint = random.choice(endpoints) + else: + raise ValueError("Endpoints must be a string or a list of strings.") + # Define the URL for the embeddings endpoint + url = f"{endpoint}/openai/deployments/{model_name}/embeddings?api-version=2023-05-15" + + credential = AzureCliCredential() + token = credential.get_token('https://cognitiveservices.azure.com/') + headers = { + "Content-Type": "application/json", + 'Authorization': 'Bearer ' + token.token + } + model = model_name - if isinstance(endpoint, list): - endpoint = random.choice(endpoint) - # Define the URL for the embeddings endpoint - url = f"{endpoint}/openai/deployments/{model}/embeddings?api-version=2023-05-15" - - credential = AzureCliCredential() - token = credential.get_token('https://cognitiveservices.azure.com/') - headers = { - "Content-Type": "application/json", - 'Authorization': 'Bearer ' + token.token - } # Set up the payload for the request payload = { - "input": input_text + "input": input_text, + "model": 
model } # Make the request to the Azure OpenAI service @@ -217,14 +236,34 @@ def extract_answer(message: dict) -> str | None: if __name__ == "__main__": - call_openai_model_with_tools( - messages=[{"role": "user", "content": "Hello, how are you?"}], - endpoints=["https://msra-im-openai-eus2.openai.azure.com"], - model_name="o3", - tools=[], - image_paths=[], - max_tokens=4096, - temperature=0.0, - tool_choice="auto", - return_json=False, - ) \ No newline at end of file + # Example for Azure + # call_openai_model_with_tools( + # messages=[{"role": "user", "content": "Hello, how are you?"}], + # endpoints=["https://msra-im-openai-eus2.openai.azure.com"], + # model_name="o3", + # tools=[], + # image_paths=[], + # max_tokens=4096, + # temperature=0.0, + # tool_choice="auto", + # return_json=False, + # ) + + # Example for OpenAI + api_key = os.environ.get("OPENAI_API_KEY") + if api_key: + response = call_openai_model_with_tools( + messages=[{"role": "user", "content": "Hello, how are you?"}], + endpoints=None, # Not used for OpenAI + model_name="gpt-4o", + api_key=api_key, + tools=[], + image_paths=[], + max_tokens=4096, + temperature=0.0, + tool_choice="auto", + return_json=False, + ) + print(response) + else: + print("OPENAI_API_KEY environment variable not set.") \ No newline at end of file From 39bc683a2450964524cbfa752f850277cb580dc2 Mon Sep 17 00:00:00 2001 From: Xiaoyi Date: Wed, 16 Jul 2025 01:23:42 -0700 Subject: [PATCH 02/15] add lite mode to support audio-focused youtube video --- README.md | 6 ++++ dvd/config.py | 1 + dvd/dvd_core.py | 2 ++ dvd/frame_caption.py | 21 ++++++++++++++ dvd/video_utils.py | 47 ++++++++++++++++++++++++++++++-- local_run.py | 65 ++++++++++++++++++++++++++++---------------- 6 files changed, 116 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 3637ccd..7fcd417 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,12 @@ This repository contains the official implementation of the paper [Deep Video Di 
![image](https://github.com/user-attachments/assets/ac1c7f0a-3c10-4c4c-88d1-7bfe0e2010e1) +## Update + +- **2025/07/16**: Add `lite_mode` to enable a lightweight version of the agent that uses only subtitles. Good for Youtube podcast analysis! +- **2025/07/14**: Support OpenAI API and Azure OpenAI API. +- **2025/07/08**: Initial release of the Deep Video Discovery codebase. + ## Introduction **Deep Video Discovery (DVD)** is a deep-research style question answering agent designed for understanding extra-long videos. Leveraging the powerful capabilities of large language models (LLMs), DVD effectively interprets and processes extensive video content to answer complex user queries. diff --git a/dvd/config.py b/dvd/config.py index 7bde7a5..0659d71 100644 --- a/dvd/config.py +++ b/dvd/config.py @@ -24,6 +24,7 @@ AOAI_EMBEDDING_LARGE_DIM = 3072 # ------------------ agent and tool setting ------------------ # +LITE_MODE = True # if True, only leverage srt subtitle, no pixel downloaded or pixel captioning GLOBAL_BROWSE_TOPK = 300 OVERWRITE_CLIP_SEARCH_TOPK = 0 # 0 means no overwrite and let agent decide diff --git a/dvd/dvd_core.py b/dvd/dvd_core.py index 142af9c..c3418f9 100644 --- a/dvd/dvd_core.py +++ b/dvd/dvd_core.py @@ -26,6 +26,8 @@ def finish(answer: A[str, D("Answer to the user's question.")]) -> None: class DVDCoreAgent: def __init__(self, video_db_path, video_caption_path, max_iterations): self.tools = [frame_inspect_tool, clip_search_tool, global_browse_tool, finish] + if config.LITE_MODE: + self.tools.remove(frame_inspect_tool) self.name_to_function_map = {tool.__name__: tool for tool in self.tools} self.function_schemas = [ {"function": as_json_schema(func), "type": "function"} diff --git a/dvd/frame_caption.py b/dvd/frame_caption.py index d0a36b8..9ed7e0a 100644 --- a/dvd/frame_caption.py +++ b/dvd/frame_caption.py @@ -240,6 +240,8 @@ def _caption_clip(task: Tuple[str, Dict], caption_ckpt_folder) -> Tuple[str, dic try: assert isinstance(resp, str), 
f"Response must be a JSON string instead of {type(resp)}:{resp}." parsed = json.loads(resp) + parsed["clip_description"] += f"\n\nTranscript during this video clip: {transcript}." # add transcript to description + resp = json.dumps(parsed) with open(os.path.join(caption_ckpt_folder, f"{timestamp}.json"), "w") as f: f.write(resp) return timestamp, parsed @@ -329,6 +331,25 @@ def process_video( json.dump(frame_captions, f, indent=4) +def process_video_lite( + output_caption_folder: str, + subtitle_file_path: str, +): + """ + Process video in LITE_MODE using SRT subtitles. + """ + captions = parse_srt_to_dict(subtitle_file_path) + frame_captions = {} + for key, text in captions.items(): + frame_captions[key] = { + "caption": f"\n\nTranscript during this video clip: {text}.", + } + frame_captions["subject_registry"] = {} + with open( + os.path.join(output_caption_folder, "captions.json"), "w" + ) as f: + json.dump(frame_captions, f, indent=4) + # --------------------------------------------------------------------------- # # main # # --------------------------------------------------------------------------- # diff --git a/dvd/video_utils.py b/dvd/video_utils.py index 2b3840f..6660dfb 100644 --- a/dvd/video_utils.py +++ b/dvd/video_utils.py @@ -99,10 +99,51 @@ def load_video( ) shutil.copy2(subtitle_source, subtitle_destination) - return os.path.abspath(destination_path) +def download_srt_subtitle(video_url: str, output_path: str): + """Downloads an SRT subtitle from a YouTube URL.""" + if not _is_youtube_url(video_url): + raise ValueError("Provided URL is not a valid YouTube link.") + + output_dir = os.path.dirname(output_path) + os.makedirs(output_dir, exist_ok=True) + + ydl_opts = { + 'writesubtitles': True, + 'subtitleslangs': ['en'], + 'subtitlesformat': 'srt', + 'skip_download': True, + 'outtmpl': os.path.join(output_dir, '%(id)s.%(ext)s'), + } + + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + info = ydl.extract_info(video_url, download=False) + video_id = info['id'] 
+ ydl.download([video_url]) + + # Locate the downloaded subtitle file (yt-dlp names them as ..srt) + downloaded_subtitle_path = None + for f in os.listdir(output_dir): + if f.startswith(video_id) and f.endswith(".srt"): + downloaded_subtitle_path = os.path.join(output_dir, f) + break + + if downloaded_subtitle_path: + shutil.move(downloaded_subtitle_path, output_path) + else: + # Try auto-generated subtitles + ydl_opts['writeautomaticsub'] = True + with yt_dlp.YoutubeDL(ydl_opts) as ydl_auto: + ydl_auto.download([video_url]) + + if os.path.exists(os.path.join(output_dir, f"{video_id}.en.vtt")): + # yt-dlp might download as .vtt and convert, check for final .srt + for f in os.listdir(output_dir): + if f.startswith(video_id) and f.endswith('.srt'): + shutil.move(os.path.join(output_dir, f), output_path) + return + + raise FileNotFoundError(f"Could not find SRT subtitle for {video_url}") - # ------------------- Not found ------------------- - raise FileNotFoundError(f"Video source '{video_source}' not found or is not a valid URL.") def decode_video_to_frames(video_path: str) -> str: """ diff --git a/local_run.py b/local_run.py index c08f239..c74b0a1 100644 --- a/local_run.py +++ b/local_run.py @@ -2,8 +2,8 @@ import os import argparse from dvd.dvd_core import DVDCoreAgent -from dvd.video_utils import load_video, decode_video_to_frames -from dvd.frame_caption import process_video +from dvd.video_utils import load_video, decode_video_to_frames, download_srt_subtitle +from dvd.frame_caption import process_video, process_video_lite from dvd.utils import extract_answer def main(): @@ -27,31 +27,50 @@ def main(): frames_dir = os.path.join(config.VIDEO_DATABASE_FOLDER, video_id, "frames") captions_dir = os.path.join(config.VIDEO_DATABASE_FOLDER, video_id, "captions") video_db_path = os.path.join(config.VIDEO_DATABASE_FOLDER, video_id, "database.json") + srt_path = os.path.join(config.VIDEO_DATABASE_FOLDER, video_id, "subtitles.srt") - # Download video - if not 
os.path.exists(video_path): - print(f"Downloading video from {video_url} to {video_path}...") - load_video(video_url, video_path) - print("Video downloaded.") + if config.LITE_MODE: + print("Running in LITE_MODE.") + if not os.path.exists(srt_path): + print(f"Downloading SRT subtitle for {video_url} to {srt_path}...") + try: + download_srt_subtitle(video_url, srt_path) + print("SRT subtitle downloaded.") + except Exception as e: + print(f"Error downloading subtitle: {e}") + print("Please turn off LITE_MODE and try again.") + return + else: + print(f"SRT subtitle already exists at {srt_path}.") + + # In LITE_MODE, we use srt as caption file + process_video_lite(captions_dir, srt_path) + caption_file = os.path.join(captions_dir, "captions.json") else: - print(f"Video already exists at {video_path}.") + # Download video + if not os.path.exists(video_path): + print(f"Downloading video from {video_url} to {video_path}...") + load_video(video_url, video_path) + print("Video downloaded.") + else: + print(f"Video already exists at {video_path}.") - # Decode video to frames - if not os.path.exists(frames_dir) or not os.listdir(frames_dir): - print(f"Decoding video to frames in {frames_dir}...") - decode_video_to_frames(video_path) - print("Video decoded.") - else: - print(f"Frames already exist in {frames_dir}.") + # Decode video to frames + if not os.path.exists(frames_dir) or not os.listdir(frames_dir): + print(f"Decoding video to frames in {frames_dir}...") + decode_video_to_frames(video_path) + print("Video decoded.") + else: + print(f"Frames already exist in {frames_dir}.") - # Get captions - caption_file = os.path.join(captions_dir, "captions.json") - if not os.path.exists(caption_file): - print("Processing video to get captions...") - process_video(frames_dir, captions_dir) - print("Captions generated.") - else: - print(f"Captions already exist at {caption_file}.") + # Get captions + caption_file = os.path.join(captions_dir, "captions.json") + if not 
os.path.exists(caption_file): + print("Processing video to get captions...") + process_video(frames_dir, captions_dir) + print("Captions generated.") + else: + print(f"Captions already exist at {caption_file}.") # Initialize DVDCoreAgent print("Initializing DVDCoreAgent...") From 608f010c901e404e9248f47a87c922b807091811 Mon Sep 17 00:00:00 2001 From: Xiaoyi Date: Thu, 17 Jul 2025 02:36:11 -0700 Subject: [PATCH 03/15] add gradio demo --- .gitignore | 1 + README.md | 2 + app.py | 276 ++++++++++++++++++++++++++++++++++++++++++++++++ dvd/dvd_core.py | 56 ++++++++++ 4 files changed, 335 insertions(+) create mode 100644 app.py diff --git a/.gitignore b/.gitignore index e08cddb..4f09084 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ tmp/ video_database/ .git.bak/ +.gradio/ # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/README.md b/README.md index 7fcd417..27f0344 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ [![arXiv](https://img.shields.io/badge/arXiv-2504.16082-A42C25?style=flat&logo=arXiv&logoColor=A42C25)](https://arxiv.org/abs/2505.18079) [![License](https://img.shields.io/badge/License-MIT-red.svg)](https://opensource.org/licenses/MIT) +[![Hugging Face Spaces](https://img.shields.io/badge/Spaces-Demo-blueviolet?logo=huggingface&logoColor=white)](TODO) This repository contains the official implementation of the paper [Deep Video Discovery: Agentic Search with Tool Use for Long-form Video Understanding](https://arxiv.org/abs/2505.18079), which achieves the state-of-the-art performance by a large margin on multiple long video benchmarks including the challenging [LVBench](https://lvbench.github.io/). @@ -11,6 +12,7 @@ This repository contains the official implementation of the paper [Deep Video Di ## Update +- **2025/07/17**: Add gradio demo. - **2025/07/16**: Add `lite_mode` to enable a lightweight version of the agent that uses only subtitles. Good for Youtube podcast analysis! 
- **2025/07/14**: Support OpenAI API and Azure OpenAI API. - **2025/07/08**: Initial release of the Deep Video Discovery codebase. diff --git a/app.py b/app.py new file mode 100644 index 0000000..1a69102 --- /dev/null +++ b/app.py @@ -0,0 +1,276 @@ +import json +import os, argparse, gradio as gr +from dvd import config +from dvd.dvd_core import DVDCoreAgent +from dvd.video_utils import load_video, decode_video_to_frames, download_srt_subtitle +from dvd.frame_caption import process_video, process_video_lite +from dvd.utils import extract_answer + +######################################################################## +# Helper functions +######################################################################## +def get_youtube_thumbnail(video_url: str): + """Extract YouTube video ID and return thumbnail URL.""" + if not video_url: + return None + + # Extract video ID from YouTube URL + video_id = None + if "youtube.com/watch?v=" in video_url: + video_id = video_url.split("v=")[1].split("&")[0] + elif "youtu.be/" in video_url: + video_id = video_url.split("youtu.be/")[1].split("?")[0] + + if video_id: + # YouTube provides several thumbnail qualities + # maxresdefault > hqdefault > mqdefault > default + return f"https://img.youtube.com/vi/{video_id}/hqdefault.jpg" + + return None + +def _prepare_video_assets(video_url: str): + """Download / decode / caption the video exactly as in local_run.py, + returning (video_id, caption_file, video_db_path).""" + # --- reuse logic from local_run.py (trimmed for brevity) ------------- + if "v=" in video_url: # YouTube URL + video_id = video_url.split("v=")[1] + else: # local file or misc. 
+ video_id = os.path.splitext(os.path.basename(video_url))[0] + + video_path = os.path.join(config.VIDEO_DATABASE_FOLDER, "raw", f"{video_id}.mp4") + frames_dir = os.path.join(config.VIDEO_DATABASE_FOLDER, video_id, "frames") + captions_dir = os.path.join(config.VIDEO_DATABASE_FOLDER, video_id, "captions") + video_db_path= os.path.join(config.VIDEO_DATABASE_FOLDER, video_id, "database.json") + srt_path = os.path.join(config.VIDEO_DATABASE_FOLDER, video_id, "subtitles.srt") + + if config.LITE_MODE: + if not os.path.exists(srt_path): + download_srt_subtitle(video_url, srt_path) + process_video_lite(captions_dir, srt_path) + caption_file = os.path.join(captions_dir, "captions.json") + else: + if not os.path.exists(video_path): + load_video(video_url, video_path) + if not os.path.exists(frames_dir) or not os.listdir(frames_dir): + decode_video_to_frames(video_path) + caption_file = os.path.join(captions_dir, "captions.json") + if not os.path.exists(caption_file): + process_video(frames_dir, captions_dir) + + return video_id, caption_file, video_db_path + +def solve(video_url: str, question: str): + """Streamed inference function used by Gradio.""" + if not video_url or not question: + yield "❗ Please provide both a video URL and a question." 
+ return + + try: + yield "🔄 **Processing video...**" + _, caption_file, video_db_path = _prepare_video_assets(video_url) + + yield "🤖 **Initializing DVD agent...**" + agent = DVDCoreAgent(video_db_path, caption_file, config.MAX_ITERATIONS) + + accumulated_text = "### 🎯 Analysis Process:\n" + final_answer = None + + for msg in agent.stream_run(question): + # Only process messages with a role attribute + if not isinstance(msg, dict) or "role" not in msg: + continue + + # Show assistant's thinking process + if msg.get("role") == "assistant": + content = msg.get("content", "") + if content: + accumulated_text += f"\n\n**🤔 Assistant Thinking:**\n{content}" + yield accumulated_text + + # Check if assistant called the finish function + tool_calls = msg.get("tool_calls", []) + for tc in tool_calls: + if tc.get("function", {}).get("name") == "finish": + try: + args = json.loads(tc.get("function", {}).get("arguments", "{}")) + final_answer = args.get("answer", "") + except: + pass + + # Show when a tool is being called + elif msg.get("role") == "tool_call": + tool_name = msg.get("name", "unknown") + tool_args = msg.get("arguments", "{}") + try: + args_dict = json.loads(tool_args) + args_dict.pop("database", None) + # Format arguments nicely + args_str = json.dumps(args_dict, indent=2) + except: + args_str = tool_args + if tool_name != "finish": + accumulated_text += f"\n\n**🔄 Calling Tool:** `{tool_name}`\n```json\n{args_str}\n```" + yield accumulated_text + + # Show tool observations + elif msg.get("role") == "tool": + tool_name = msg.get("name", "unknown") + tool_result = msg.get("content", "") + + # Truncate long results for display + if len(tool_result) > 2000: + tool_result = tool_result[:2000] + "..." 
+ + accumulated_text += f"\n\n**✅ Tool Result `{tool_name}`:**\n```\n{tool_result}\n```" + yield accumulated_text + + # Add final answer if found + if final_answer: + accumulated_text += f"### ✅📃 **Final Answer:**\n\n{final_answer}" + else: + accumulated_text += "\n\n---\n### ✅ **Analysis Complete!**" + + yield accumulated_text + + except Exception as e: + import traceback + yield f"### ⚠️ Error Occurred\n\n```\n{e}\n```\n\nDetails:\n```\n{traceback.format_exc()}\n```" + +######################################################################## +# Gradio UI +######################################################################## +def launch(args): + # Custom CSS for better styling + custom_css = """ + .gradio-container { + font-family: 'Inter', sans-serif; + } + .markdown-text { + font-size: 16px; + } + #answer-box { + border: 2px solid #e5e7eb; + border-radius: 8px; + padding: 20px; + background-color: #f9fafb; + min-height: 400px; + max-height: 600px; + overflow-y: auto; + } + .button-primary { + background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); + color: white; + font-weight: bold; + font-size: 18px; + padding: 12px 24px; + } + #video-thumbnail { + border-radius: 8px; + box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1); + } + """ + + with gr.Blocks(title="DVD Video Q&A Demo", css=custom_css, theme=gr.themes.Soft()) as demo: + gr.Markdown( + """ + # 🎬 Deep Video Discovery: Agentic Search with Tool Use for Long-form Video Understanding + +

+ Provide a YouTube URL, then ask any question about the video content. + The system will analyze the video and provide detailed answers. + Note that this online demo only provides lite mode of DVD where only subtitles are used. + To use full DVD capabilities, please deploy it locally. +

+ """ + ) + + with gr.Row(): + with gr.Column(scale=1): + gr.Markdown("### 📹 Video Input") + video_url = gr.Textbox( + label="Video URL / Path", + placeholder="e.g. https://www.youtube.com/watch?v=dQw4w9WgXcQ", + lines=1, + info="Support YouTube URLs or local video paths" + ) + + # Add video thumbnail + video_thumbnail = gr.Image( + label="Video Thumbnail", + elem_id="video-thumbnail", + height=200, + visible=False, + interactive=False + ) + + gr.Markdown("### ❓ Your Question") + question = gr.Textbox( + label="Question about the video", + placeholder="What happens in this video? Who are the main characters?", + lines=3, + info="Ask anything about the video content" + ) + + with gr.Row(): + run_btn = gr.Button("🔍 Analyze Video", variant="primary", elem_classes=["button-primary"]) + clear_btn = gr.ClearButton([video_url, question, video_thumbnail], value="🗑️ Clear") + + gr.Markdown("### 💡 Example Questions") + examples = gr.Examples( + examples=[ + ["https://www.youtube.com/watch?v=i2qSxMVeVLI", "What is the main topic discussed in this video?"], + ["https://www.youtube.com/watch?v=nOxKexn3iBo", "Who are the speakers and what are their key points?"], + ], + inputs=[video_url, question], + label="" + ) + + with gr.Column(scale=2): + gr.Markdown("### 📊 Analysis Results") + answer_box = gr.Markdown( + value="*Results will appear here after clicking 'Analyze Video'...*", + elem_id="answer-box", + label="" + ) + + gr.Markdown( + """ + --- +

+ DVD: Powered by advanced video understanding and language models | + GitHub +

+ """ + ) + + # Event handlers + def update_thumbnail(url): + """Update thumbnail when URL changes.""" + thumbnail_url = get_youtube_thumbnail(url) + if thumbnail_url: + return gr.update(value=thumbnail_url, visible=True) + else: + return gr.update(value=None, visible=False) + + video_url.change( + fn=update_thumbnail, + inputs=[video_url], + outputs=[video_thumbnail] + ) + + import inspect + click_kwargs = dict(fn=solve, inputs=[video_url, question], outputs=answer_box) + if "stream" in inspect.signature(gr.Button.click).parameters: + click_kwargs["stream"] = True + run_btn.click(**click_kwargs) + + demo.launch(share=args.share) + +######################################################################## +# CLI entry-point (optional) +######################################################################## +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--share", action="store_true", help="Gradio share flag") + args = parser.parse_args() + launch(args) \ No newline at end of file diff --git a/dvd/dvd_core.py b/dvd/dvd_core.py index c3418f9..630d181 100644 --- a/dvd/dvd_core.py +++ b/dvd/dvd_core.py @@ -158,6 +158,62 @@ def run(self, question) -> list[dict]: return msgs + # ------------------------------------------------------------------ # + # Streaming (generator) loop + # ------------------------------------------------------------------ # + def stream_run(self, question): + """ + A generator version of `run`. + Yields: + dict: every assistant / tool message produced during reasoning. 
+ """ + msgs = copy.deepcopy(self.messages) + msgs[-1]["content"] = msgs[-1]["content"].replace("QUESTION_PLACEHOLDER", question) + + for i in range(self.max_iterations): + # Force a final `finish` on the last iteration + if i == self.max_iterations - 1: + final_usr_msg = { + "role": "user", + "content": "Please call the `finish` function to finish the task.", + } + msgs.append(final_usr_msg) + # Don't yield user messages to the UI + + response = call_openai_model_with_tools( + msgs, + endpoints=config.AOAI_ORCHESTRATOR_LLM_ENDPOINT_LIST, + model_name=config.AOAI_ORCHESTRATOR_LLM_MODEL_NAME, + tools=self.function_schemas, + temperature=0.0, + api_key=config.OPENAI_API_KEY, + ) + if response is None: + return + + response.setdefault("role", "assistant") + msgs.append(response) + yield response # ← stream assistant reply + + # Execute any requested tool calls + try: + for tool_call in response.get("tool_calls", []): + # Yield a formatted message about the tool being called + tool_name = tool_call.get("function", {}).get("name", "unknown") + tool_args = tool_call.get("function", {}).get("arguments", "{}") + yield { + "role": "tool_call", + "name": tool_name, + "arguments": tool_args + } + + self._exec_tool(tool_call, msgs) + # Only yield the tool result message + if msgs[-1].get("role") == "tool": + yield msgs[-1] # ← stream tool observation + except StopException: + return + def single_run_wrapper(info) -> dict: qid, video_db_path, video_caption_path, question = info From a74611ebfee7cd2aff4e06f93cb53a07c4504302 Mon Sep 17 00:00:00 2001 From: Xiaoyi Date: Thu, 17 Jul 2025 03:58:17 -0700 Subject: [PATCH 04/15] update README and demo link --- README.md | 2 +- app.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 27f0344..8826ee6 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ 
[![arXiv](https://img.shields.io/badge/arXiv-2504.16082-A42C25?style=flat&logo=arXiv&logoColor=A42C25)](https://arxiv.org/abs/2505.18079) [![License](https://img.shields.io/badge/License-MIT-red.svg)](https://opensource.org/licenses/MIT) -[![Hugging Face Spaces](https://img.shields.io/badge/Spaces-Demo-blueviolet?logo=huggingface&logoColor=white)](TODO) +[![Hugging Face Spaces](https://img.shields.io/badge/Spaces-Demo-blueviolet?logo=huggingface&logoColor=white)](https://2b295b16087857c68f.gradio.live/) This repository contains the official implementation of the paper [Deep Video Discovery: Agentic Search with Tool Use for Long-form Video Understanding](https://arxiv.org/abs/2505.18079), which achieves the state-of-the-art performance by a large margin on multiple long video benchmarks including the challenging [LVBench](https://lvbench.github.io/). diff --git a/app.py b/app.py index 1a69102..d7fcf0a 100644 --- a/app.py +++ b/app.py @@ -42,6 +42,9 @@ def _prepare_video_assets(video_url: str): captions_dir = os.path.join(config.VIDEO_DATABASE_FOLDER, video_id, "captions") video_db_path= os.path.join(config.VIDEO_DATABASE_FOLDER, video_id, "database.json") srt_path = os.path.join(config.VIDEO_DATABASE_FOLDER, video_id, "subtitles.srt") + os.makedirs(os.path.join(config.VIDEO_DATABASE_FOLDER, "raw"), exist_ok=True) + os.makedirs(frames_dir, exist_ok=True) + os.makedirs(captions_dir, exist_ok=True) if config.LITE_MODE: if not os.path.exists(srt_path): @@ -126,7 +129,7 @@ def solve(video_url: str, question: str): # Add final answer if found if final_answer: - accumulated_text += f"### ✅📃 **Final Answer:**\n\n{final_answer}" + accumulated_text += f"\n### 📃✅ **Final Answer:**\n\n{final_answer}" else: accumulated_text += "\n\n---\n### ✅ **Analysis Complete!**" From 4c879e245e4dece1a723a9a3700ec656a874b76d Mon Sep 17 00:00:00 2001 From: Xiaoyi Date: Mon, 21 Jul 2025 05:47:29 -0700 Subject: [PATCH 05/15] update requirements --- README.md | 5 +++++ requirements.txt | 8 
++++++++ 2 files changed, 13 insertions(+) create mode 100644 requirements.txt diff --git a/README.md b/README.md index 8826ee6..10c2730 100644 --- a/README.md +++ b/README.md @@ -53,6 +53,11 @@ The core design of DVD includes: pip install -r requirements.txt ``` +3. (Optional) **Install gradio for demo:** + ```bash + pip install gradio + ``` + ## Usage Note: Set up your configuration by updating the variables in `config.py`. diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..9b7d3e0 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +numpy<2 +opencv-python-headless +tqdm +nano-vectordb +azure-identity +requests +yt-dlp +openai From 95740e5f3f1f9305dfccf094567f280fde13d303 Mon Sep 17 00:00:00 2001 From: Xiaoyi Date: Wed, 23 Jul 2025 10:42:02 -0700 Subject: [PATCH 06/15] update demo link --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 10c2730..7cb5578 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![arXiv](https://img.shields.io/badge/arXiv-2504.16082-A42C25?style=flat&logo=arXiv&logoColor=A42C25)](https://arxiv.org/abs/2505.18079) [![License](https://img.shields.io/badge/License-MIT-red.svg)](https://opensource.org/licenses/MIT) -[![Hugging Face Spaces](https://img.shields.io/badge/Spaces-Demo-blueviolet?logo=huggingface&logoColor=white)](https://2b295b16087857c68f.gradio.live/) +[![Hugging Face Spaces](https://img.shields.io/badge/Spaces-Demo-blueviolet?logo=huggingface&logoColor=white)](https://e1b3f5d6045818fdc6.gradio.live) This repository contains the official implementation of the paper [Deep Video Discovery: Agentic Search with Tool Use for Long-form Video Understanding](https://arxiv.org/abs/2505.18079), which achieves the state-of-the-art performance by a large margin on multiple long video benchmarks including the challenging [LVBench](https://lvbench.github.io/). 
From 217b9e2d85ba54916605f9f2cdd190414d8deeca Mon Sep 17 00:00:00 2001 From: Xiaoyi Date: Thu, 31 Jul 2025 03:55:59 -0700 Subject: [PATCH 07/15] update README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7cb5578..70bac8b 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![arXiv](https://img.shields.io/badge/arXiv-2504.16082-A42C25?style=flat&logo=arXiv&logoColor=A42C25)](https://arxiv.org/abs/2505.18079) [![License](https://img.shields.io/badge/License-MIT-red.svg)](https://opensource.org/licenses/MIT) -[![Hugging Face Spaces](https://img.shields.io/badge/Spaces-Demo-blueviolet?logo=huggingface&logoColor=white)](https://e1b3f5d6045818fdc6.gradio.live) +[![Hugging Face Spaces](https://img.shields.io/badge/Spaces-Demo-blueviolet?logo=huggingface&logoColor=white)](https://71d30852e95c6c7dd9.gradio.live) This repository contains the official implementation of the paper [Deep Video Discovery: Agentic Search with Tool Use for Long-form Video Understanding](https://arxiv.org/abs/2505.18079), which achieves the state-of-the-art performance by a large margin on multiple long video benchmarks including the challenging [LVBench](https://lvbench.github.io/). 
From 3cc2495e207c9a2f98433ce8644cd1aa0f81019f Mon Sep 17 00:00:00 2001 From: Xiaoyi Date: Fri, 1 Aug 2025 08:47:21 -0700 Subject: [PATCH 08/15] fix auto subtitle bug --- README.md | 2 +- dvd/video_utils.py | 16 ++-------------- 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 70bac8b..7087c1a 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![arXiv](https://img.shields.io/badge/arXiv-2504.16082-A42C25?style=flat&logo=arXiv&logoColor=A42C25)](https://arxiv.org/abs/2505.18079) [![License](https://img.shields.io/badge/License-MIT-red.svg)](https://opensource.org/licenses/MIT) -[![Hugging Face Spaces](https://img.shields.io/badge/Spaces-Demo-blueviolet?logo=huggingface&logoColor=white)](https://71d30852e95c6c7dd9.gradio.live) +[![Hugging Face Spaces](https://img.shields.io/badge/Spaces-Demo-blueviolet?logo=huggingface&logoColor=white)](https://87fc7dc81d4b38ed01.gradio.live) This repository contains the official implementation of the paper [Deep Video Discovery: Agentic Search with Tool Use for Long-form Video Understanding](https://arxiv.org/abs/2505.18079), which achieves the state-of-the-art performance by a large margin on multiple long video benchmarks including the challenging [LVBench](https://lvbench.github.io/). 
diff --git a/dvd/video_utils.py b/dvd/video_utils.py index 6660dfb..7f6c891 100644 --- a/dvd/video_utils.py +++ b/dvd/video_utils.py @@ -109,9 +109,9 @@ def download_srt_subtitle(video_url: str, output_path: str): ydl_opts = { 'writesubtitles': True, - 'subtitleslangs': ['en'], 'subtitlesformat': 'srt', 'skip_download': True, + 'writeautomaticsub': True, 'outtmpl': os.path.join(output_dir, '%(id)s.%(ext)s'), } @@ -130,18 +130,6 @@ def download_srt_subtitle(video_url: str, output_path: str): if downloaded_subtitle_path: shutil.move(downloaded_subtitle_path, output_path) else: - # Try auto-generated subtitles - ydl_opts['writeautomaticsub'] = True - with yt_dlp.YoutubeDL(ydl_opts) as ydl_auto: - ydl_auto.download([video_url]) - - if os.path.exists(os.path.join(output_dir, f"{video_id}.en.vtt")): - # yt-dlp might download as .vtt and convert, check for final .srt - for f in os.listdir(output_dir): - if f.startswith(video_id) and f.endswith('.srt'): - shutil.move(os.path.join(output_dir, f), output_path) - return - raise FileNotFoundError(f"Could not find SRT subtitle for {video_url}") @@ -192,4 +180,4 @@ def decode_video_to_frames(video_path: str) -> str: return os.path.abspath(frames_dir) if __name__ == "__main__": - decode_video_to_frames("/home/xiaoyizhang/DVD/video_database/raw/i2qSxMVeVLI.mp4") \ No newline at end of file + download_srt_subtitle("https://www.youtube.com/watch?v=PQFQ-3d2J-8", "./video_database/PQFQ-3d2J-8/subtitles.srt") \ No newline at end of file From f0b6e33b0d6c7b2e1e4a574071de7e6caedf17e0 Mon Sep 17 00:00:00 2001 From: Xiaoyi Date: Fri, 1 Aug 2025 09:08:52 -0700 Subject: [PATCH 09/15] update README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 7087c1a..5a5a042 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ This repository contains the official implementation of the paper [Deep Video Di ## Update +- **2025/08/02**: Support auto subtitle in the demo. - **2025/07/17**: Add gradio demo. 
- **2025/07/16**: Add `lite_mode` to enable a lightweight version of the agent that uses only subtitles. Good for Youtube podcast analysis! - **2025/07/14**: Support OpenAI API and Azure OpenAI API. From b608f8db8e9eccc685a3726de374d00dbeb49730 Mon Sep 17 00:00:00 2001 From: Xiaoyi Date: Mon, 4 Aug 2025 01:52:07 -0700 Subject: [PATCH 10/15] upload captions --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 5a5a042..2ea80bc 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ This repository contains the official implementation of the paper [Deep Video Di ## Update +- **2025/08/04**: Upload captions on benchmarks for reproduction: [LVBench](https://1drv.ms/u/c/f029f6f5a52c17c4/ETR7ogx7YCtBgtDu66a4R14B7RKLZJoz20D4Z5I1KD6HTg?e=404kKg), [LVBench w/ transcripts](https://1drv.ms/u/c/f029f6f5a52c17c4/EcqO2lC_hRxGn-0t0IBNKZcBts3HDCEg8mZo4ltN6kXFUQ?e=XmabUn), [Video-MME](https://1drv.ms/u/c/f029f6f5a52c17c4/EVKjXQnPjeZGi-onOxEMb8UBxqI9NexKzccHuYEe8-0Lig?e=a4SxCU), [LongVideoBench](https://1drv.ms/u/c/f029f6f5a52c17c4/EQp_PABeb3ZIiysjIn-_5gEBbkhtfcBwCM1pel9xl3JHPg?e=TLpQXQ) and [EgoSchema](https://1drv.ms/u/c/f029f6f5a52c17c4/Ec0oEX3tO5pIknRdEqT9LDQB0hbS9vR9fUJaVbRfCQPJKg?e=bszgh6). - **2025/08/02**: Support auto subtitle in the demo. - **2025/07/17**: Add gradio demo. - **2025/07/16**: Add `lite_mode` to enable a lightweight version of the agent that uses only subtitles. Good for Youtube podcast analysis! From 60217809751c892b3fd548163b57a147b84bd715 Mon Sep 17 00:00:00 2001 From: anonymous626 <131758638+anonymous626@users.noreply.github.com> Date: Wed, 6 Aug 2025 14:53:31 +0800 Subject: [PATCH 11/15] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2ea80bc..20eae2d 100644 --- a/README.md +++ b/README.md @@ -69,7 +69,7 @@ Note: Set up your configuration by updating the variables in `config.py`. 
The `local_run.py` script provides an example of how to run the Deep Video Discovery agent by providing a youtube url and question about it. ```bash - python local_run.py https://www.youtube.com/watch?v=ktbGziZlt3c "how many animals appear in this video" + python local_run.py https://www.youtube.com/watch?v=PQFQ-3d2J-8 "what did the main speaker talk about in the last part of video?" ``` ## TODO From 60ed1567ff098391cc158966e5fa8af004cedb63 Mon Sep 17 00:00:00 2001 From: anonymous626 <131758638+anonymous626@users.noreply.github.com> Date: Fri, 19 Sep 2025 15:41:09 +0800 Subject: [PATCH 12/15] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 20eae2d..58aaebf 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ This repository contains the official implementation of the paper [Deep Video Di ## Update +- **2025/09/19**: Accepted by NeurIPS 2025 🎉 - **2025/08/04**: Upload captions on benchmarks for reproduction: [LVBench](https://1drv.ms/u/c/f029f6f5a52c17c4/ETR7ogx7YCtBgtDu66a4R14B7RKLZJoz20D4Z5I1KD6HTg?e=404kKg), [LVBench w/ transcripts](https://1drv.ms/u/c/f029f6f5a52c17c4/EcqO2lC_hRxGn-0t0IBNKZcBts3HDCEg8mZo4ltN6kXFUQ?e=XmabUn), [Video-MME](https://1drv.ms/u/c/f029f6f5a52c17c4/EVKjXQnPjeZGi-onOxEMb8UBxqI9NexKzccHuYEe8-0Lig?e=a4SxCU), [LongVideoBench](https://1drv.ms/u/c/f029f6f5a52c17c4/EQp_PABeb3ZIiysjIn-_5gEBbkhtfcBwCM1pel9xl3JHPg?e=TLpQXQ) and [EgoSchema](https://1drv.ms/u/c/f029f6f5a52c17c4/Ec0oEX3tO5pIknRdEqT9LDQB0hbS9vR9fUJaVbRfCQPJKg?e=bszgh6). - **2025/08/02**: Support auto subtitle in the demo. - **2025/07/17**: Add gradio demo. 
From 68ad71428545b690224094bcbd6e66faa0ce18f1 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 15 Oct 2025 14:14:16 +0000 Subject: [PATCH 13/15] update reproduce guide --- README.md | 1 + dvd/dvd_core.py | 21 ++++ dvd/utils.py | 9 +- reproduce/REPRODUCE.md | 29 +++++ reproduce/decode_frames.py | 221 ++++++++++++++++++++++++++++++++++ reproduce/download_lvbench.sh | 5 + reproduce/prepare_database.py | 52 ++++++++ reproduce/run_benchmark.py | 57 +++++++++ 8 files changed, 391 insertions(+), 4 deletions(-) create mode 100644 reproduce/REPRODUCE.md create mode 100644 reproduce/decode_frames.py create mode 100644 reproduce/download_lvbench.sh create mode 100644 reproduce/prepare_database.py create mode 100644 reproduce/run_benchmark.py diff --git a/README.md b/README.md index 58aaebf..805df16 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ This repository contains the official implementation of the paper [Deep Video Di ## Update +- **2025/10/15**: Add a markdown to help to reproduce the LVBench results: [REPRODUCE.md](reproduce/REPRODUCE.md). Email me (xiaoyizhang@microsoft) if your issue get no response in 24 hours. - **2025/09/19**: Accepted by NeurIPS 2025 🎉 - **2025/08/04**: Upload captions on benchmarks for reproduction: [LVBench](https://1drv.ms/u/c/f029f6f5a52c17c4/ETR7ogx7YCtBgtDu66a4R14B7RKLZJoz20D4Z5I1KD6HTg?e=404kKg), [LVBench w/ transcripts](https://1drv.ms/u/c/f029f6f5a52c17c4/EcqO2lC_hRxGn-0t0IBNKZcBts3HDCEg8mZo4ltN6kXFUQ?e=XmabUn), [Video-MME](https://1drv.ms/u/c/f029f6f5a52c17c4/EVKjXQnPjeZGi-onOxEMb8UBxqI9NexKzccHuYEe8-0Lig?e=a4SxCU), [LongVideoBench](https://1drv.ms/u/c/f029f6f5a52c17c4/EQp_PABeb3ZIiysjIn-_5gEBbkhtfcBwCM1pel9xl3JHPg?e=TLpQXQ) and [EgoSchema](https://1drv.ms/u/c/f029f6f5a52c17c4/Ec0oEX3tO5pIknRdEqT9LDQB0hbS9vR9fUJaVbRfCQPJKg?e=bszgh6). - **2025/08/02**: Support auto subtitle in the demo. 
diff --git a/dvd/dvd_core.py b/dvd/dvd_core.py index 630d181..db5f014 100644 --- a/dvd/dvd_core.py +++ b/dvd/dvd_core.py @@ -9,6 +9,7 @@ from dvd.func_call_shema import as_json_schema from dvd.func_call_shema import doc as D from dvd.utils import call_openai_model_with_tools +from concurrent.futures import ThreadPoolExecutor, as_completed TOPK = 16 @@ -158,6 +159,26 @@ def run(self, question) -> list[dict]: return msgs + def parallel_run(self, questions, max_workers=4) -> list[list[dict]]: + """ + Run multiple questions in parallel. + """ + results = [] + results = [None] * len(questions) + with ThreadPoolExecutor(max_workers=max_workers) as executor: + future_to_index = { + executor.submit(self.run, q): idx + for idx, q in enumerate(questions) + } + for future in as_completed(future_to_index): + idx = future_to_index[future] + try: + results[idx] = future.result() + except Exception as e: + print(f"Error processing question: {e}") + results[idx] = None + return results + # ------------------------------------------------------------------ # # Streaming (generator) loop # ------------------------------------------------------------------ # diff --git a/dvd/utils.py b/dvd/utils.py index 077cb28..663bbf4 100644 --- a/dvd/utils.py +++ b/dvd/utils.py @@ -222,16 +222,17 @@ def extract_answer(message: dict) -> str | None: str | None The extracted answer, or ``None`` if no answer could be found. """ - # Direct text response - if (content := message.get("content")): - return content.strip() - # Tool-based response for call in message.get("tool_calls", []): args_json = call["function"]["arguments"] args = json.loads(args_json) if (answer := args.get("answer")): return answer + + # Direct text response + if (content := message.get("content")): + return content.strip() + return None diff --git a/reproduce/REPRODUCE.md b/reproduce/REPRODUCE.md new file mode 100644 index 0000000..53d547d --- /dev/null +++ b/reproduce/REPRODUCE.md @@ -0,0 +1,29 @@ +# Reproduce + +0. 
Set up the database root path with `export DATABASE_DIR=/path/to/your/database/folder`. + +1. Download the pre-built database from [here](https://huggingface.co/datasets/xyzhang626/LongVideoBenchmarkCaptions/tree/main). Or you can run `wget https://huggingface.co/datasets/xyzhang626/LongVideoBenchmarkCaptions/resolve/main/LVBench_4.1.zip` to download the database. + +2. Prepare the database JSON files. You can use the script `prepare_database.py` to prepare the database JSON files. Replace `/path/to/your/zipfile` with the path to your downloaded LVBench database zip. The script writes the database JSON files into `$DATABASE_DIR/LVBench_4.1`. + +```bash +python -m reproduce.prepare_database /path/to/your/zipfile $DATABASE_DIR +``` + +3. Download the LVBench dataset. These third-party assets are available [here](https://huggingface.co/datasets/AIWinter/LVBench/tree/main). Or you can use the script below to download the dataset. + +```bash +export TARGET_DIR=$DATABASE_DIR/LVBench_4.1 +bash reproduce/download_lvbench.sh +``` + +4. Decode the videos into raw frames. You can use the script `decode_frames.py`; adjust the `--part` path to point at your downloaded LVBench dataset. + +```bash +python -m reproduce.decode_frames --part $DATABASE_DIR/LVBench_4.1/all_videos_split.zip.001 --out $TARGET_DIR --fps 2 +``` + +5. Run the benchmark. You can use the script `run_benchmark.py` to run the benchmark. Adjust the paths to point at your prepared database JSON files. 
+```bash +python -m reproduce.run_benchmark $TARGET_DIR $TARGET_DIR/video_info.meta.jsonl +``` diff --git a/reproduce/decode_frames.py b/reproduce/decode_frames.py new file mode 100644 index 0000000..6788b49 --- /dev/null +++ b/reproduce/decode_frames.py @@ -0,0 +1,221 @@ +import argparse +import os +import re +import sys +import tempfile +import zipfile +import shutil +import logging +from pathlib import Path +from typing import List, Iterable +import cv2 +from tqdm import tqdm +import multiprocessing as mp + +VIDEO_EXTS = {'.mp4', '.mov', '.avi', '.mkv', '.webm', '.flv', '.mpg', '.mpeg', '.m4v'} + + +def find_all_parts(part_path: Path) -> List[Path]: + """ + Given any split archive file, find all parts in the same directory and return them in order. + Supports *.zip.001 / *.zip.002 or *.z01 / *.z02 + .zip formats (compatible with common naming). + """ + name = part_path.name + parent = part_path.parent + + # Match something like something.zip.001 or something.zip.002 + m = re.match(r'(.+\.zip)\.(\d{3})$', name) + if m: + base = m.group(1) + parts = sorted(parent.glob(base + ".???")) + return parts + + # Match WinZip style: something.z01, something.z02 ... + something.zip + m2 = re.match(r'(.+)\.z(\d{2})$', name, re.IGNORECASE) + if m2: + base_prefix = m2.group(1) + zparts = sorted(parent.glob(base_prefix + ".z??"), key=lambda p: p.suffix.lower()) + main_zip = parent / (base_prefix + ".zip") + if main_zip.exists(): + return zparts + [main_zip] + + # If it's a complete zip file + if name.endswith(".zip"): + return [part_path] + + raise ValueError(f"Unrecognized split archive naming: {name}") + + +def assemble_zip(parts: List[Path]) -> Path: + """ + Concatenate all parts in order into a temporary zip file and return its path. + If there is only one .zip file, return its path directly (no copy). 
+ """ + if len(parts) == 1 and parts[0].suffix == ".zip": + return parts[0] + + tmp_fd, tmp_path = tempfile.mkstemp(suffix=".zip", prefix="merged_zip_") + os.close(tmp_fd) + with open(tmp_path, "wb") as w: + for p in parts: + logging.info(f"Merging part: {p.name}") + with open(p, "rb") as r: + shutil.copyfileobj(r, w, length=1024 * 1024) + return Path(tmp_path) + + +def iter_video_members(zf: zipfile.ZipFile) -> Iterable[zipfile.ZipInfo]: + for info in zf.infolist(): + if info.is_dir(): + continue + ext = Path(info.filename).suffix.lower() + if ext in VIDEO_EXTS: + yield info + + +def ensure_dir(path: Path): + path.mkdir(parents=True, exist_ok=True) + + +def decode_video(temp_video_path: Path, out_root: Path, fps: float, overwrite: bool = False, video_stem: str | None = None): + """ + Extract frames at the given fps and save them. + video_stem: Pass the original video filename (without extension) to avoid using the temporary filename. + """ + cap = cv2.VideoCapture(str(temp_video_path)) + if not cap.isOpened(): + logging.error(f"Cannot open video: {temp_video_path}") + return + + orig_fps = cap.get(cv2.CAP_PROP_FPS) or 0 + total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0) + if orig_fps <= 0: + logging.warning(f"Original FPS abnormal ({orig_fps}), will estimate by frame time.") + orig_fps = fps # fallback + + interval = 1.0 / fps + next_t = 0.0 + frame_index = 0 + saved_index = 0 + + # Use original filename (if provided) + video_stem = video_stem or temp_video_path.stem + frames_dir = out_root / video_stem / "frames" + ensure_dir(frames_dir) + + if not overwrite: + # Count existing frames, auto-continue + existing = sorted(frames_dir.glob("frames_n*.jpg")) + if existing: + last = existing[-1].stem + m = re.search(r'frames_n(\d+)', last) + if m: + saved_index = int(m.group(1)) + logging.info(f"{video_stem}: Append mode, {saved_index} frames already exist.") + + pbar = tqdm(total=total_frames if total_frames > 0 else None, + desc=f"Decoding {video_stem}", + 
unit="f", + dynamic_ncols=True) + + while True: + ret, frame = cap.read() + if not ret: + break + current_t = frame_index / orig_fps + if current_t + 1e-6 >= next_t: + saved_index += 1 + out_path = frames_dir / f"frames_n{saved_index:06d}.jpg" + if overwrite or not out_path.exists(): + cv2.imwrite(str(out_path), frame, [int(cv2.IMWRITE_JPEG_QUALITY), 95]) + next_t += interval + frame_index += 1 + pbar.update(1) + pbar.close() + cap.release() + logging.info(f"{video_stem}: Frame extraction complete, total (including existing) {saved_index} frames.") + + +def process_archive(part_path: Path, out_root: Path, fps: float, overwrite: bool): + parts = find_all_parts(part_path) + logging.info("Found parts in order: " + ", ".join(p.name for p in parts)) + merged_zip = assemble_zip(parts) + cleanup_needed = merged_zip not in parts # If we generated a temporary file + try: + with zipfile.ZipFile(merged_zip, 'r') as zf: + video_members = list(iter_video_members(zf)) + logging.info(f"Number of video files in archive: {len(video_members)}") + temp_files = [] # (process, tmp_path) + max_workers = min(len(video_members), mp.cpu_count() or 1) + logging.info(f"Parallel decoding: using {max_workers} processes") + + def wait_one(): + # Wait for the earliest started process to finish and clean up temp file + proc, tpath = temp_files.pop(0) + proc.join() + try: + tpath.unlink(missing_ok=True) + except Exception: + pass + + for info in video_members: + # Write this video to a temporary file + suffix = Path(info.filename).suffix + with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp: + tmp.write(zf.read(info)) + tmp_path = Path(tmp.name) + + original_stem = Path(info.filename).stem + # Start a separate process for decoding, passing the original filename + p = mp.Process(target=decode_video, args=(tmp_path, out_root, fps), kwargs={'overwrite': overwrite, 'video_stem': original_stem}) + p.start() + temp_files.append((p, tmp_path)) + + # If reached parallel limit, wait for 
the earliest one to finish + if len(temp_files) >= max_workers: + wait_one() + + # Wait for all remaining to finish + while temp_files: + wait_one() + finally: + if cleanup_needed: + try: + merged_zip.unlink(missing_ok=True) + except Exception: + pass + + +def parse_args(): + ap = argparse.ArgumentParser(description="Extract video frames from split zip archives at a given fps") + ap.add_argument("--part", required=True, help="Path to any split archive file (e.g. all_videos_split.zip.002)") + ap.add_argument("--out", required=True, help="Output root directory") + ap.add_argument("--fps", type=float, required=True, help="Target frame extraction rate (e.g. 5)") + ap.add_argument("--overwrite", action="store_true", help="Overwrite existing frames") + ap.add_argument("--log-level", default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"]) + return ap.parse_args() + +def main(): + args = parse_args() + logging.basicConfig( + level=getattr(logging, args.log_level), + format="%(asctime)s [%(levelname)s] %(message)s", + ) + + part_path = Path(args.part).expanduser().resolve() + out_root = Path(args.out).expanduser().resolve() + ensure_dir(out_root) + + if args.fps <= 0: + logging.error("fps must be > 0") + sys.exit(1) + + if not part_path.exists(): + logging.error(f"File does not exist: {part_path}") + sys.exit(1) + + process_archive(part_path, out_root, args.fps, overwrite=args.overwrite) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/reproduce/download_lvbench.sh b/reproduce/download_lvbench.sh new file mode 100644 index 0000000..999d999 --- /dev/null +++ b/reproduce/download_lvbench.sh @@ -0,0 +1,5 @@ +mkdir -p "$TARGET_DIR" +printf '%03d\n' {3..14} | \ +wget --continue -P "$TARGET_DIR" "https://huggingface.co/datasets/zai-org/LVBench/resolve/main/video_info.meta.jsonl" +xargs -P 8 -I{} wget --continue -P "$TARGET_DIR" \ + "https://huggingface.co/datasets/AIWinter/LVBench/resolve/main/all_videos_split.zip.{}" diff --git 
a/reproduce/prepare_database.py b/reproduce/prepare_database.py new file mode 100644 index 0000000..d37230e --- /dev/null +++ b/reproduce/prepare_database.py @@ -0,0 +1,52 @@ +import os +import json +import zipfile +from pathlib import Path +import argparse + +def replace_root_path(zip_file_path, database_dir): + """ + Read a zip file, replace 'video_file_root' in JSON files, and save to the specified directory. + + Args: + zip_file_path: Path to the zip file. + database_dir: Directory for the database. + """ + zip_file_name = Path(zip_file_path).stem + + with zipfile.ZipFile(zip_file_path, 'r') as zip_ref: + # Iterate through all files in the zip + for file_name in zip_ref.namelist(): + if file_name.endswith('.json'): + # Read the JSON file + with zip_ref.open(file_name) as json_file: + data = json.load(json_file) + + # Replace video_file_root + new_root = os.path.join(database_dir, zip_file_name) + data['video_file_root'] = new_root + + # Create output directory + json_name = Path(file_name).stem + output_dir = os.path.join(database_dir, zip_file_name, json_name) + os.makedirs(output_dir, exist_ok=True) + + # Save the JSON file + output_path = os.path.join(output_dir, 'database.json') + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(data, f, ensure_ascii=False, indent=2) + + print(f"Processed: {file_name} -> {output_path}") + +if __name__ == "__main__": + # Example usage + parser = argparse.ArgumentParser(description='Replace video_file_root path in JSON files inside a zip archive') + parser.add_argument('zip_file', type=str, help='Path to the zip file') + parser.add_argument('database_dir', type=str, help='Path to the database directory') + + args = parser.parse_args() + + zip_file = args.zip_file + database_dir = args.database_dir + + replace_root_path(zip_file, database_dir) \ No newline at end of file diff --git a/reproduce/run_benchmark.py b/reproduce/run_benchmark.py new file mode 100644 index 0000000..36d630d --- /dev/null +++ 
b/reproduce/run_benchmark.py @@ -0,0 +1,57 @@ +import dvd.config as config +import os +import argparse +import json +from dvd.dvd_core import DVDCoreAgent +from dvd.video_utils import load_video, decode_video_to_frames, download_srt_subtitle +from dvd.frame_caption import process_video, process_video_lite +from dvd.utils import extract_answer + +def main(): + parser = argparse.ArgumentParser(description="Run DVDCoreAgent on a video.") + parser.add_argument("benchmark_database_folder", help="The path to the benchmark database folder.") + parser.add_argument("benchmark_metadata", help="The path to the benchmark metadata file.") + args = parser.parse_args() + + benchmark_database_folder = args.benchmark_database_folder + + with open(args.benchmark_metadata, "r") as f: + lines = f.readlines() + + total_data = [] + results = {} + for line in lines: + # one line for one video instance containing multiple questions + video_info = json.loads(line) + video_id = video_info["key"] + qa_list = video_info["qa"] + + qids = [qa["uid"] for qa in qa_list] + questions = [qa["question"] for qa in qa_list] + + frames_dir = os.path.join(benchmark_database_folder, video_id, "frames") + if not os.path.exists(frames_dir) or len(os.listdir(frames_dir)) == 0: + print(f"Frames for video {frames_dir} not found, skipping...") + continue + video_db_path = os.path.join(benchmark_database_folder, video_id, "database.json") + + print(f"Initializing DVDCoreAgent from database {video_db_path}...") + agent = DVDCoreAgent(video_db_path, video_caption_path=None, max_iterations=15) + agent.messages[-1]['content'] += "\nSelect the best option that accurately addresses the question.\nAnswer with the option\'s letter from the given choices directly and only give the best option." 
+ print("Agent initialized.") + # Run with questions + msgs = agent.parallel_run(questions, max_workers=4) + for qid, question, msg in zip(qids, questions, msgs): + answer = extract_answer(msg[-1]) + results[qid] = { + "question": question, + "answer": answer, + "reasoning": msg + } + + with open("benchmark_results.json", "w") as f: + json.dump(results, f, ensure_ascii=False, indent=2) + +if __name__ == "__main__": + main() + From 6e0247a4e30baf143e5dba5dd7135cf7c3318058 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 15 Oct 2025 14:22:34 +0000 Subject: [PATCH 14/15] update reproduce README --- README.md | 2 +- reproduce/{REPRODUCE.md => README.md} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename reproduce/{REPRODUCE.md => README.md} (100%) diff --git a/README.md b/README.md index 805df16..332dd9c 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ This repository contains the official implementation of the paper [Deep Video Di ## Update -- **2025/10/15**: Add a markdown to help to reproduce the LVBench results: [REPRODUCE.md](reproduce/REPRODUCE.md). Email me (xiaoyizhang@microsoft) if your issue get no response in 24 hours. +- **2025/10/15**: Add a markdown to help to reproduce the LVBench results: [REPRODUCE.md](reproduce/README.md). Email me (xiaoyizhang [/at/] microsoft.com) if your issue gets no response in 24 hours. 
- **2025/09/19**: Accepted by NeurIPS 2025 🎉 - **2025/08/04**: Upload captions on benchmarks for reproduction: [LVBench](https://1drv.ms/u/c/f029f6f5a52c17c4/ETR7ogx7YCtBgtDu66a4R14B7RKLZJoz20D4Z5I1KD6HTg?e=404kKg), [LVBench w/ transcripts](https://1drv.ms/u/c/f029f6f5a52c17c4/EcqO2lC_hRxGn-0t0IBNKZcBts3HDCEg8mZo4ltN6kXFUQ?e=XmabUn), [Video-MME](https://1drv.ms/u/c/f029f6f5a52c17c4/EVKjXQnPjeZGi-onOxEMb8UBxqI9NexKzccHuYEe8-0Lig?e=a4SxCU), [LongVideoBench](https://1drv.ms/u/c/f029f6f5a52c17c4/EQp_PABeb3ZIiysjIn-_5gEBbkhtfcBwCM1pel9xl3JHPg?e=TLpQXQ) and [EgoSchema](https://1drv.ms/u/c/f029f6f5a52c17c4/Ec0oEX3tO5pIknRdEqT9LDQB0hbS9vR9fUJaVbRfCQPJKg?e=bszgh6). - **2025/08/02**: Support auto subtitle in the demo. diff --git a/reproduce/REPRODUCE.md b/reproduce/README.md similarity index 100% rename from reproduce/REPRODUCE.md rename to reproduce/README.md From 000ac61ad38fde9d4f4e3ef2399282f037abe228 Mon Sep 17 00:00:00 2001 From: anonymous626 <131758638+anonymous626@users.noreply.github.com> Date: Mon, 3 Nov 2025 15:40:40 +0800 Subject: [PATCH 15/15] Add audio transcription script for reproduction This script transcribes audio files from a specified directory using the WhisperX model, aligns the output, and saves the results in JSON format. --- reproduce/transcribe.py | 53 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 reproduce/transcribe.py diff --git a/reproduce/transcribe.py b/reproduce/transcribe.py new file mode 100644 index 0000000..790f72d --- /dev/null +++ b/reproduce/transcribe.py @@ -0,0 +1,53 @@ +import json +import os +import whisperx +import gc +from whisperx.alignment import DEFAULT_ALIGN_MODELS_TORCH, DEFAULT_ALIGN_MODELS_HF + +device = "cuda" +batch_size = 64 # reduce if low on GPU mem +compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy) + +# 1. Transcribe with original whisper (batched) +model = whisperx.load_model("large-v3", device, compute_type=compute_type) +# 3. 
Assign speaker labels +HF_TOKEN = "YOUR_HF_TOKEN" +# diarize_model = whisperx.DiarizationPipeline(use_auth_token=HF_TOKEN, device=device) + +root = "./lvbench_vdb" +for file in os.listdir(root): + # save model to local path (optional) + # model_dir = "/path/" + # model = whisperx.load_model("large-v2", device, compute_type=compute_type, download_root=model_dir) + + if not file.endswith(".mp3"): + continue + + audio_file = os.path.join(root, file) + + if os.path.exists(audio_file.replace(".mp3", ".json")): + print(f"File {audio_file.replace('.mp3', '.json')} already exists, skipping...") + with open(audio_file.replace(".mp3", ".json"), "r") as f: + legacy_result = json.load(f) + else: + legacy_result = None + + audio = whisperx.load_audio(audio_file) + result = model.transcribe(audio, batch_size=batch_size) + + if result["language"] in DEFAULT_ALIGN_MODELS_TORCH or \ + result["language"] in DEFAULT_ALIGN_MODELS_HF: + lang = result["language"] + else: + lang = 'en' + print(f"Language {result['language']} not supported, using English instead for {audio_file}.") + + # 2. Align whisper output + model_a, metadata = whisperx.load_align_model(language_code=lang, device=device) + result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False) + + with open(audio_file.replace(".mp3", ".json"), "w") as f: + json.dump(result, f, indent=4) + print(f"saved as {audio_file.replace('.mp3', '.json')}") + +