Seed Dataset Preparation

Download Code

# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "datasets",
#     "pymupdf",
#     "pandas",
#     "pyarrow",
# ]
# ///
"""Long-Document Understanding Seed Dataset Preparation

This script uses HuggingFace's FinePDFs dataset (HuggingFaceFW/finepdfs) as
an example data source to demonstrate how to prepare seed data for the rest
of the recipes. It downloads the original PDFs, renders each page to a PNG
image, and produces three seed parquet files:

  1. **per-page seed** – one row per page, with a ``png_images_base64``
     column containing a JSON array with a single base64-encoded PNG of
     that page. Suitable for single-page recipes (02 through 06).
  2. **windowed seed** – one row per window of consecutive pages, with a
     ``png_images_base64`` column containing a JSON array of base64-encoded
     PNGs for the pages in that window. The window size adapts to document
     length (2 pages for short documents, up to 8 for long ones).
     Suitable for the multi-page windowed recipe (07).
  3. **whole-document seed** – one row per document, with a
     ``png_images_base64`` column containing a JSON array of base64-encoded
     PNGs for all pages. Suitable for the whole-document recipe (08).
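
All three outputs share the ``id``, ``url``, ``date``, ``total_pages`` and
``png_images_base64`` columns; the per-page seed additionally carries
``page_number``, and the windowed seed carries ``start_page``, ``end_page``
and ``window_size``.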

Prerequisites:
    - Internet access to download PDFs from their original URLs.

Run:
    # Prepare seeds from 10 English PDFs (default)
    uv run 01-seed-dataset-preparation.py --output-dir ./seed_data

    # Prepare seeds from 50 PDFs
    uv run 01-seed-dataset-preparation.py --output-dir ./seed_data --num-docs 50

    # Use a different language subset
    uv run 01-seed-dataset-preparation.py --output-dir ./seed_data --subset fra_Latn

    # Failed downloads are always skipped; set a custom download timeout
    uv run 01-seed-dataset-preparation.py --output-dir ./seed_data --timeout 30

    # For help
    uv run 01-seed-dataset-preparation.py --help
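
Consume the output (a minimal sketch; the path assumes the default
invocation above, and the column layout matches the schema described
earlier in this docstring):

    import base64, json
    import pandas as pd

    df = pd.read_parquet("./seed_data/seed_per_page.parquet")
    first_page_png = base64.b64decode(json.loads(df.loc[0, "png_images_base64"])[0])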
"""

from __future__ import annotations

import base64
import json
import logging
import os
import urllib.request
from argparse import ArgumentParser
from pathlib import Path

import fitz  # pymupdf
import pandas as pd
from datasets import load_dataset

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger(__name__)

logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("httpcore").setLevel(logging.WARNING)
logging.getLogger("huggingface_hub").setLevel(logging.WARNING)
logging.getLogger("datasets").setLevel(logging.WARNING)
logging.getLogger("fsspec").setLevel(logging.WARNING)

DPI = 144
FINEPDFS_REPO = "HuggingFaceFW/finepdfs"


def download_pdf(url: str, timeout: int = 20) -> bytes | None:
    """Download a PDF from *url*, returning raw bytes or None on failure."""
    try:
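        # A browser-like User-Agent avoids trivial bot blocking on some hosts.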
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read()
    except Exception as exc:
        log.warning("Failed to download %s: %s", url, exc)
        return None


def render_pages(pdf_bytes: bytes, dpi: int = DPI) -> list[bytes]:
    """Render every page of *pdf_bytes* to PNG, returning a list of raw PNG bytes."""
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    pages: list[bytes] = []
    for page in doc:
        pix = page.get_pixmap(dpi=dpi)
        pages.append(pix.tobytes("png"))
    doc.close()
    return pages
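
# At the default 144 DPI, a US Letter page (8.5 x 11 inches) renders to
# roughly 1224 x 1584 pixels (inches * dpi), which keeps body text legible
# without producing oversized PNGs.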


def png_to_base64(png_bytes: bytes) -> str:
    """Encode raw PNG bytes as a base64 string."""
    return base64.b64encode(png_bytes).decode("ascii")


def adaptive_window_size(n_pages: int) -> int:
    """Choose a window size that scales with document length.

    Short documents get small windows (2 pages) so multi-page questions
    remain feasible; longer documents get larger windows (up to 8) to
    cover more context per seed row.
    """
    # Closed bands so boundary counts (exactly 20, 30, ..., 60 pages) do
    # not fall through to the 2-page default.
    if n_pages <= 10:
        return 2
    if n_pages <= 20:
        return 3
    if n_pages <= 30:
        return 4
    if n_pages <= 40:
        return 5
    if n_pages <= 50:
        return 6
    if n_pages <= 60:
        return 7
    return 8
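
# Examples: adaptive_window_size(5) == 2, adaptive_window_size(15) == 3,
# adaptive_window_size(45) == 6, adaptive_window_size(120) == 8. With the
# default --max-pages of 50, windows larger than 6 pages only appear when
# that cap is raised.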


def main() -> None:
    parser = ArgumentParser(description="Prepare seed parquets from FinePDFs")
    parser.add_argument(
        "--output-dir",
        type=str,
        required=True,
        help="Directory for output parquet files",
    )
    parser.add_argument(
        "--num-docs",
        type=int,
        default=10,
        help="Number of PDF documents to process (default: 10)",
    )
    parser.add_argument(
        "--subset",
        type=str,
        default="eng_Latn",
        help="FinePDFs language subset (default: eng_Latn)",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=20,
        help="HTTP download timeout in seconds (default: 20)",
    )
    parser.add_argument(
        "--dpi",
        type=int,
        default=DPI,
        help=f"Render resolution in DPI (default: {DPI})",
    )
    parser.add_argument(
        "--max-pages",
        type=int,
        default=50,
        help="Skip documents with more pages than this (default: 50)",
    )
    parser.add_argument(
        "--min-window-pages",
        type=int,
        default=2,
        help="Minimum window size in pages; documents whose adaptive window size falls below this produce no windowed output (default: 2)",
    )
    args = parser.parse_args()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    log.info(
        "Streaming %d documents from %s (subset=%s)",
        args.num_docs,
        FINEPDFS_REPO,
        args.subset,
    )

    ds = load_dataset(
        FINEPDFS_REPO,
        name=args.subset,
        split="train",
        streaming=True,
    )
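    # With streaming=True, rows are fetched lazily as the loop below iterates,
    # so only the first usable --num-docs documents are pulled from the Hub
    # rather than the whole subset.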

    per_page_rows: list[dict] = []
    windowed_rows: list[dict] = []
    whole_doc_rows: list[dict] = []

    docs_processed = 0
    for row in ds:
        if docs_processed >= args.num_docs:
            break

        doc_id = row.get("id", f"doc_{docs_processed:06d}")
        url = row["url"]
        date = row.get("date", "")

        pdf_bytes = download_pdf(url, timeout=args.timeout)
        if pdf_bytes is None:
            continue

        try:
            page_pngs = render_pages(pdf_bytes, dpi=args.dpi)
        except Exception as exc:
            log.warning("Failed to render %s: %s", url, exc)
            continue

        if len(page_pngs) == 0:
            log.warning("No pages rendered for %s, skipping", url)
            continue

        if len(page_pngs) > args.max_pages:
            log.info(
                "Skipping %s (%d pages > --max-pages %d)",
                url,
                len(page_pngs),
                args.max_pages,
            )
            continue

        page_b64s: list[str] = []

        for page_idx, png_bytes in enumerate(page_pngs):
            b64 = png_to_base64(png_bytes)
            page_b64s.append(b64)

            per_page_rows.append(
                {
                    "id": doc_id,
                    "url": url,
                    "date": date,
                    "page_number": page_idx,
                    "total_pages": len(page_pngs),
                    "png_images_base64": json.dumps([b64]),
                }
            )

        whole_doc_rows.append(
            {
                "id": doc_id,
                "url": url,
                "date": date,
                "total_pages": len(page_pngs),
                "png_images_base64": json.dumps(page_b64s),
            }
        )

        # Windowed seed: non-overlapping, full-size windows; trailing pages
        # that do not fill a complete window are dropped.
        n_pages = len(page_b64s)
        win_size = adaptive_window_size(n_pages)
        if win_size >= args.min_window_pages:
            for i in range(n_pages // win_size):
                win_start = i * win_size
                win_end = win_start + win_size
                windowed_rows.append(
                    {
                        "id": doc_id,
                        "url": url,
                        "date": date,
                        "total_pages": n_pages,
                        "start_page": win_start,
                        "end_page": win_end,
                        "window_size": win_size,
                        "png_images_base64": json.dumps(page_b64s[win_start:win_end]),
                    }
                )

        docs_processed += 1
        log.info(
            "[%d/%d] %s%d pages",
            docs_processed,
            args.num_docs,
            url,
            len(page_pngs),
        )

    if not per_page_rows:
        log.error("No documents were successfully processed. Exiting.")
        return

    per_page_path = output_dir / "seed_per_page.parquet"
    windowed_path = output_dir / "seed_windowed.parquet"
    whole_doc_path = output_dir / "seed_whole_document.parquet"

    pd.DataFrame(per_page_rows).to_parquet(per_page_path, index=False)
    if windowed_rows:
        pd.DataFrame(windowed_rows).to_parquet(windowed_path, index=False)
    pd.DataFrame(whole_doc_rows).to_parquet(whole_doc_path, index=False)

    log.info("Per-page seed:       %s (%d rows)", per_page_path, len(per_page_rows))
    if windowed_rows:
        log.info("Windowed seed:       %s (%d rows)", windowed_path, len(windowed_rows))
    else:
        log.info("Windowed seed:       not written (no windows were produced)")
    log.info("Whole-document seed: %s (%d rows)", whole_doc_path, len(whole_doc_rows))


if __name__ == "__main__":
    main()
    # Force-exit to avoid hanging on background threads from datasets/fsspec.
    os._exit(0)