download_vpt

Bulk downloader for the OpenAI VPT Minecraft dataset.

Reads a VPT JSON index file ({"basedir": ..., "relpaths": [...]}) and downloads every .mp4 recording plus its matching .jsonl action log into a target directory. Uses aria2c when available for multi-segment parallel downloads; if aria2c is not installed, it prints an install hint and downloads nothing.
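
For reference, the script reads only two keys from the index. A minimal index file might look like this (the base URL and relative paths below are made-up placeholders, not real VPT data):

{
  "basedir": "https://example.com/vpt/data/",
  "relpaths": [
    "10.0/run-0001.mp4",
    "10.0/run-0002.mp4"
  ]
}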

Typical usage:

python download_vpt.py --input_path 8xx_Jun_29.json \
    --output_path ./data/vpt-recordings --num_downloads 100 --workers 8
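
aria2c ships in most package managers as the "aria2" package, for example (exact package names may vary by distribution):

apt install aria2      # Debian/Ubuntu
brew install aria2     # macOS (Homebrew)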

  1"""Bulk downloader for the OpenAI VPT Minecraft dataset.
  2
  3Reads a VPT JSON index file (``{"basedir": ..., "relpaths": [...]}``) and
  4downloads every ``.mp4`` recording plus its matching ``.jsonl`` action log
  5into a target directory. Uses ``aria2c`` when available for multi-segment
  6parallel downloads; falls back to a warning otherwise.
  7
  8Typical usage:
  9    python download_vpt.py --input_path 8xx_Jun_29.json \\
 10        --output_path ./data/vpt-recordings --num_downloads 100 --workers 8
 11"""
import os
import json
import shutil
import argparse
import subprocess
from pathlib import Path


def build_urls(filename, num_downloads):
    """Build the list of URLs to fetch from a VPT index JSON.

    Args:
        filename: Path to a VPT index file. Must contain ``basedir`` (a
            URL prefix) and ``relpaths`` (a list of relative paths).
        num_downloads: Number of ``(mp4, jsonl)`` pairs to take from the
            front of ``relpaths``. Pass ``0`` / falsy for "all".

    Returns:
        list[str]: URLs in ``[mp4, jsonl, mp4, jsonl, ...]`` order —
        twice ``num_downloads`` entries when truncated.
    """
    with open(filename, "r", encoding="utf-8") as f:
        data = json.load(f)
    paths = data["relpaths"][:num_downloads] if num_downloads else data["relpaths"]
    urls = []
    for path in paths:
        mp4 = data["basedir"] + path
        urls.extend([mp4, mp4[:-3] + "jsonl"])
    return urls


def download_aria2c(urls, output, workers):
    """Fetch ``urls`` into ``output`` using aria2c.

    Writes a temporary ``_urls.txt`` manifest (one URL per entry with a
    per-entry ``dir=`` override) and invokes ``aria2c`` with ``--async-dns=false``
    so that the system resolver is used — this fixes intermittent DNS
    resolution failures against the VPT CDN seen on university networks.

    Args:
        urls: List of URLs to download.
        output: Target directory (created by the caller).
        workers: Max concurrent downloads. aria2c also splits each file
            into 8 connections internally.

    Returns:
        None. Prints a warning when aria2c exits non-zero so the caller
        can re-run to resume.
    """
    url_file = Path(output) / "_urls.txt"
    with open(url_file, "w", encoding="utf-8") as f:
        for url in urls:
            f.write(f"{url}\n  dir={output}\n")

    cmd = [
        "aria2c",
        f"--input-file={url_file}",
        f"--max-concurrent-downloads={workers}",
        "--split=8",
        "--max-connection-per-server=8",
        "--min-split-size=5M",
        "--continue=true",
        "--max-tries=5",
        "--retry-wait=3",
        "--async-dns=false",  # use system DNS — fixes resolution failures
        "--console-log-level=warn",
        "--summary-interval=0",
    ]

    print(f"[aria2c] Downloading {len(urls)} files "
          f"({workers} concurrent, 8 segments/file)...")
    result = subprocess.run(cmd, check=False)
    url_file.unlink(missing_ok=True)

    if result.returncode != 0:
        print(f"[aria2c] Finished with exit code {result.returncode}. "
              "Some downloads may have failed — re-run to resume.")


def main():
    """CLI entry point: parse arguments and dispatch to aria2c.

    Parses ``--input_path``, ``--output_path``, ``--num_downloads`` and
    ``--workers``, creates the output directory, builds the URL list
    via :func:`build_urls`, and hands it to :func:`download_aria2c`
    when ``aria2c`` is on ``PATH``. Prints an install hint otherwise.
    """
    p = argparse.ArgumentParser(description="Download VPT Minecraft data")
    p.add_argument("--input_path",    type=str, required=True,
                   help="Path to the JSON index file")
    p.add_argument("--output_path",   type=str, required=True,
                   help="Directory to save downloaded files")
    p.add_argument("--num_downloads", type=int, default=0,
                   help="Number of mp4/jsonl pairs (0 = all)")
    p.add_argument("--workers",       type=int, default=8,
                   help="Parallel download slots (default: 8)")
    args = p.parse_args()

    os.makedirs(args.output_path, exist_ok=True)
    urls = build_urls(args.input_path, args.num_downloads)
    print(f"Preparing to download {len(urls)} files into '{args.output_path}'...")

    if shutil.which("aria2c"):
        download_aria2c(urls, args.output_path, args.workers)
        print("Done.")
    else:
        print("[info] aria2c not found. Install aria2c for faster multi-segment downloads.")


if __name__ == "__main__":
    main()
def build_urls(filename, num_downloads):

Build the list of URLs to fetch from a VPT index JSON.

Arguments:
  • filename: Path to a VPT index file. Must contain basedir (a URL prefix) and relpaths (a list of relative paths).
  • num_downloads: Number of (mp4, jsonl) pairs to take from the front of relpaths. Pass 0 / falsy for "all".
Returns:
  • list[str]: URLs in [mp4, jsonl, mp4, jsonl, ...] order — twice num_downloads entries when truncated.
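
As a sketch, given a hypothetical index.json shaped like the example at the top of this page, the mp4/jsonl pairing works out to:

>>> build_urls("index.json", num_downloads=1)
['https://example.com/vpt/data/10.0/run-0001.mp4',
 'https://example.com/vpt/data/10.0/run-0001.jsonl']

Each .jsonl URL is derived by swapping the .mp4 suffix (mp4[:-3] + "jsonl"), so the index only needs to list the recordings.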

def download_aria2c(urls, output, workers):

Fetch urls into output using aria2c.

Writes a temporary _urls.txt manifest (one URL per entry with a per-entry dir= override) and invokes aria2c with --async-dns=false so that the system resolver is used — this fixes intermittent DNS resolution failures against the VPT CDN seen on university networks.

Arguments:
  • urls: List of URLs to download.
  • output: Target directory (created by the caller).
  • workers: Max concurrent downloads. aria2c also splits each file into 8 connections internally.
Returns:
  • None. Prints a warning when aria2c exits non-zero so the caller can re-run to resume.
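
Concretely, for an output directory of ./data/vpt-recordings the generated manifest would look like this (placeholder URLs). Each URI line is followed by a whitespace-indented per-download option line, which is aria2c's input-file format:

https://example.com/vpt/data/10.0/run-0001.mp4
  dir=./data/vpt-recordings
https://example.com/vpt/data/10.0/run-0001.jsonl
  dir=./data/vpt-recordings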

def main():

CLI entry point: parse arguments and dispatch to aria2c.

Parses --input_path, --output_path, --num_downloads and --workers, creates the output directory, builds the URL list via build_urls(), and hands it to download_aria2c() when aria2c is on PATH. Prints an install hint otherwise.
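
Since aria2c resumes interrupted transfers on re-run rather than reporting per-file status at this log level, a quick completeness check after a run can help. A minimal sketch, not part of the script; it assumes downloads land flat in the output directory, which matches aria2c's default of naming each file after the last URL component:

from pathlib import Path

def find_incomplete(output_path):
    """Return .mp4 recordings whose matching .jsonl action log is missing."""
    out = Path(output_path)
    return [mp4 for mp4 in sorted(out.glob("*.mp4"))
            if not mp4.with_suffix(".jsonl").exists()]

print(find_incomplete("./data/vpt-recordings"))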