download_vpt
Bulk downloader for the OpenAI VPT Minecraft dataset.
Reads a VPT JSON index file ({"basedir": ..., "relpaths": [...]}) and
downloads every .mp4 recording plus its matching .jsonl action log
into a target directory. Uses aria2c when available for multi-segment
parallel downloads; falls back to a warning otherwise.
Typical usage:
python download_vpt.py --input_path 8xx_Jun_29.json --output_path ./data/vpt-recordings --num_downloads 100 --workers 8
"""Bulk downloader for the OpenAI VPT Minecraft dataset.

Reads a VPT JSON index file (``{"basedir": ..., "relpaths": [...]}``) and
downloads every ``.mp4`` recording plus its matching ``.jsonl`` action log
into a target directory. Uses ``aria2c`` when available for multi-segment
parallel downloads; falls back to a warning otherwise.

Typical usage:
    python download_vpt.py --input_path 8xx_Jun_29.json \\
        --output_path ./data/vpt-recordings --num_downloads 100 --workers 8
"""
import os
import json
import shutil
import argparse
import subprocess
from pathlib import Path


def build_urls(filename, num_downloads):
    """Build the list of URLs to fetch from a VPT index JSON.

    Args:
        filename: Path to a VPT index file. Must contain ``basedir`` (a
            URL prefix) and ``relpaths`` (a list of relative paths).
        num_downloads: Number of ``(mp4, jsonl)`` pairs to take from the
            front of ``relpaths``. Pass ``0`` / falsy for "all".

    Returns:
        list[str]: URLs in ``[mp4, jsonl, mp4, jsonl, ...]`` order —
        twice ``num_downloads`` entries when truncated.

    Raises:
        ValueError: If a relpath does not end in ``.mp4``. (Blind
            ``[:-3]`` slicing would silently build a corrupt URL for any
            other extension; fail loudly instead.)
    """
    with open(filename, "r", encoding="utf-8") as f:
        data = json.load(f)
    paths = data["relpaths"][:num_downloads] if num_downloads else data["relpaths"]
    urls = []
    for path in paths:
        if not path.endswith(".mp4"):
            raise ValueError(f"Expected an .mp4 relpath, got: {path!r}")
        mp4 = data["basedir"] + path
        # Each recording has a companion action log at the same path
        # with the extension swapped: foo.mp4 -> foo.jsonl.
        urls.extend([mp4, mp4[: -len("mp4")] + "jsonl"])
    return urls


def download_aria2c(urls, output, workers):
    """Fetch ``urls`` into ``output`` using aria2c.

    Writes a temporary ``_urls.txt`` manifest (one URL per entry with a
    per-entry ``dir=`` override) and invokes ``aria2c`` with
    ``--async-dns=false`` so that the system resolver is used — this
    fixes intermittent DNS resolution failures against the VPT CDN seen
    on university networks.

    Args:
        urls: List of URLs to download.
        output: Target directory (created by the caller).
        workers: Max concurrent downloads. aria2c also splits each file
            into 8 connections internally.

    Returns:
        None. Prints a warning when aria2c exits non-zero so the caller
        can re-run to resume.
    """
    url_file = Path(output) / "_urls.txt"
    with open(url_file, "w", encoding="utf-8") as f:
        for url in urls:
            # aria2c input-file format: URL line, then indented option
            # lines applying to that entry.
            f.write(url + f"\n dir={output}\n")

    cmd = [
        "aria2c",
        f"--input-file={url_file}",
        f"--max-concurrent-downloads={workers}",
        "--split=8",
        "--max-connection-per-server=8",
        "--min-split-size=5M",
        "--continue=true",
        "--max-tries=5",
        "--retry-wait=3",
        "--async-dns=false",  # use system DNS — fixes resolution failures
        "--console-log-level=warn",
        "--summary-interval=0",
    ]

    print(f"[aria2c] Downloading {len(urls)} files "
          f"({workers} concurrent, 8 segments/file)...")
    result = subprocess.run(cmd, check=False)
    url_file.unlink(missing_ok=True)

    if result.returncode != 0:
        print(f"[aria2c] Finished with exit code {result.returncode}. "
              "Some downloads may have failed — re-run to resume.")


def main():
    """CLI entry point: parse arguments and dispatch to aria2c.

    Parses ``--input_path``, ``--output_path``, ``--num_downloads`` and
    ``--workers``, creates the output directory, builds the URL list
    via :func:`build_urls`, and hands it to :func:`download_aria2c`
    when ``aria2c`` is on ``PATH``. Prints an install hint otherwise.
    """
    p = argparse.ArgumentParser(description="Download VPT Minecraft data")
    p.add_argument("--input_path", type=str, required=True,
                   help="Path to the JSON index file")
    p.add_argument("--output_path", type=str, required=True,
                   help="Directory to save downloaded files")
    p.add_argument("--num_downloads", type=int, default=0,
                   help="Number of mp4/jsonl pairs (0 = all)")
    p.add_argument("--workers", type=int, default=8,
                   help="Parallel download slots (default: 8)")
    args = p.parse_args()

    os.makedirs(args.output_path, exist_ok=True)
    urls = build_urls(args.input_path, args.num_downloads)
    print(f"Preparing to download {len(urls)} files into '{args.output_path}'...")

    if shutil.which("aria2c"):
        download_aria2c(urls, args.output_path, args.workers)
        print("Done.")
    else:
        print("[info] aria2c not found. Install aria2c for faster multi-segment downloads.")


if __name__ == "__main__":
    main()
def build_urls(filename, num_downloads):
    """Assemble the download URL list from a VPT index JSON.

    Every relative path in the index produces two URLs: the ``.mp4``
    recording itself and its companion ``.jsonl`` action log.

    Args:
        filename: VPT index file containing ``basedir`` (a URL prefix)
            and ``relpaths`` (a list of relative paths).
        num_downloads: How many pairs to take from the front of
            ``relpaths``; ``0`` (or any falsy value) means all of them.

    Returns:
        list[str]: Interleaved ``[mp4, jsonl, mp4, jsonl, ...]`` URLs.
    """
    with open(filename, "r", encoding="utf-8") as handle:
        index = json.load(handle)

    selected = index["relpaths"]
    if num_downloads:
        selected = selected[:num_downloads]

    base = index["basedir"]
    url_pairs = ((base + rel, (base + rel)[:-3] + "jsonl") for rel in selected)
    return [url for pair in url_pairs for url in pair]
Build the list of URLs to fetch from a VPT index JSON.
Arguments:
- filename: Path to a VPT index file. Must contain `basedir` (a URL prefix) and `relpaths` (a list of relative paths).
- num_downloads: Number of `(mp4, jsonl)` pairs to take from the front of `relpaths`. Pass `0` / falsy for "all".
Returns:
list[str]: URLs in `[mp4, jsonl, mp4, jsonl, ...]` order — twice `num_downloads` entries when truncated.
def download_aria2c(urls, output, workers):
    """Download ``urls`` into ``output`` via a single aria2c batch job.

    Emits a temporary ``_urls.txt`` manifest — each URL line followed by
    an indented per-entry ``dir=`` option — and hands it to ``aria2c``.
    ``--async-dns=false`` forces the system resolver, which fixes
    intermittent DNS resolution failures against the VPT CDN seen on
    university networks.

    Args:
        urls: URLs to fetch.
        output: Destination directory (created by the caller).
        workers: Maximum concurrent downloads; aria2c additionally
            splits each file into 8 connections.

    Returns:
        None. A warning is printed when aria2c exits non-zero so the
        caller can re-run to resume.
    """
    manifest = Path(output) / "_urls.txt"
    entries = []
    for url in urls:
        # aria2c manifest format: URL line, then indented option lines.
        entries.append(url + f"\n dir={output}\n")
    manifest.write_text("".join(entries), encoding="utf-8")

    command = [
        "aria2c",
        f"--input-file={manifest}",
        f"--max-concurrent-downloads={workers}",
        "--split=8",
        "--max-connection-per-server=8",
        "--min-split-size=5M",
        "--continue=true",
        "--max-tries=5",
        "--retry-wait=3",
        "--async-dns=false",  # use system DNS — fixes resolution failures
        "--console-log-level=warn",
        "--summary-interval=0",
    ]

    print(f"[aria2c] Downloading {len(urls)} files "
          f"({workers} concurrent, 8 segments/file)...")
    proc = subprocess.run(command, check=False)
    manifest.unlink(missing_ok=True)

    if proc.returncode != 0:
        print(f"[aria2c] Finished with exit code {proc.returncode}. "
              "Some downloads may have failed — re-run to resume.")
Fetch urls into output using aria2c.
Writes a temporary _urls.txt manifest (one URL per entry with a
per-entry dir= override) and invokes aria2c with --async-dns=false
so that the system resolver is used — this fixes intermittent DNS
resolution failures against the VPT CDN seen on university networks.
Arguments:
- urls: List of URLs to download.
- output: Target directory (created by the caller).
- workers: Max concurrent downloads. aria2c also splits each file into 8 connections internally.
Returns:
None. Prints a warning when aria2c exits non-zero so the caller can re-run to resume.
def main():
    """CLI entry point: parse arguments and dispatch to aria2c.

    Parses ``--input_path``, ``--output_path``, ``--num_downloads`` and
    ``--workers``, creates the output directory, builds the URL list
    via :func:`build_urls`, and hands it to :func:`download_aria2c`
    when ``aria2c`` is on ``PATH``. Prints an install hint otherwise.
    """
    parser = argparse.ArgumentParser(description="Download VPT Minecraft data")
    parser.add_argument("--input_path", type=str, required=True,
                        help="Path to the JSON index file")
    parser.add_argument("--output_path", type=str, required=True,
                        help="Directory to save downloaded files")
    parser.add_argument("--num_downloads", type=int, default=0,
                        help="Number of mp4/jsonl pairs (0 = all)")
    parser.add_argument("--workers", type=int, default=8,
                        help="Parallel download slots (default: 8)")
    args = parser.parse_args()

    os.makedirs(args.output_path, exist_ok=True)
    urls = build_urls(args.input_path, args.num_downloads)
    print(f"Preparing to download {len(urls)} files into '{args.output_path}'...")

    # Guard clause: bail out with an install hint when aria2c is absent.
    if not shutil.which("aria2c"):
        print("[info] aria2c not found. Install aria2c for faster multi-segment downloads.")
        return

    download_aria2c(urls, args.output_path, args.workers)
    print("Done.")
CLI entry point: parse arguments and dispatch to aria2c.
Parses --input_path, --output_path, --num_downloads and
--workers, creates the output directory, builds the URL list
via build_urls(), and hands it to download_aria2c()
when aria2c is on PATH. Prints an install hint otherwise.