#!/bin/sh
# SPDX-License-Identifier: MIT
# Generate a SELF-CONTAINED src/rust/ tree for the nirs4alldatasets R package so
# it builds OFFLINE at install time — no prebuilt nirs4all-datasets-capi cdylib,
# no external monorepo `crates/`, no network. This is the CRAN-submittable rework
# of the old C-shim-over-prebuilt-cdylib binding (which linked a separately built
# libnirs4all_datasets_capi via N4DS_INCLUDE / N4DS_CAPI_DIR). It mirrors the
# proven nirs4all-io / nirs4all-formats `*_R_VENDOR=1 ./configure` vendor pattern,
# applied to the datasets C-shim + Rust STATICLIB design (src/n4ds.c links the
# staticlib).
#
# The binding is a thin C shim: src/n4ds.c drives the stable n4ds_* C ABI using
# the COMMITTED header (src/nirs4all_datasets.h, copied here from the capi crate).
# The acquisition/integrity logic lives in the Rust core crates, compiled into a
# staticlib (libnirs4all_datasets_capi.a) that src/Makevars links into
# nirs4alldatasets.{so,dll}.
#
# Two modes:
#
#   N4DS_R_VENDOR=1  -> in-repo vendoring (forced). Copy the workspace crates the
#                       C ABI needs (nirs4all-datasets-core, nirs4all-datasets-capi)
#                       from the checkout into src/rust/vendored/, emit a
#                       self-contained src/rust/Cargo.toml whose paths point INSIDE
#                       the tarball, `cargo vendor` every crates.io transitive dep
#                       into src/rust/vendor.tar.xz, and write
#                       src/rust/.cargo/config.toml for an offline build. The source
#                       tarball ships src/rust/vendored/, src/rust/vendor.tar.xz, the
#                       generated src/rust/Cargo.toml, and src/nirs4all_datasets.h.
#
#                       The crates.io deps are shipped COMPRESSED (vendor.tar.xz,
#                       extracted by src/Makevars at build time) rather than as a
#                       raw directory because `R CMD build`'s tarball step strips
#                       VCS dotfiles from inside the vendored crates, which would
#                       break cargo's offline checksum verification. Shipping the
#                       archive byte-for-byte avoids that (same pattern the
#                       arrow / gifski / polars CRAN packages use).
#
#   (default)        -> already-vendored build. When src/rust/vendor.tar.xz and
#                       src/rust/vendored/ are already present (the installing user
#                       unpacked the CRAN source tarball and has no repo), DO NOT
#                       re-vendor: the bundled tree is used as-is. src/Makevars
#                       extracts vendor.tar.xz and `cargo build --offline` compiles
#                       the staticlib. This is the real CRAN install scenario.
#
# This script writes NO Makevars (src/Makevars / src/Makevars.win are committed,
# layout-stable, and extract vendor.tar.xz then build the staticlib against
# src/rust/ and link it into the C shim).
set -e

# Resolve every path from the script's own location so the script works from
# any working directory. `CDPATH=''` stops `cd` from consulting a CDPATH the
# caller may have exported: when a CDPATH entry is used, `cd` prints the
# resolved directory on stdout, which would be captured in front of `pwd`'s
# output and corrupt PKG_ROOT. `--` guards against a directory name that starts
# with `-`.
PKG_ROOT=$(CDPATH='' cd -- "$(dirname -- "$0")" && pwd)
REPO=$(CDPATH='' cd -- "$PKG_ROOT/../../.." && pwd)      # bindings/r/nirs4alldatasets -> repo root
RUST="$PKG_ROOT/src/rust"
VENDORED="$RUST/vendored"                   # copied workspace crates (plain dirs)
VENDOR_ARCHIVE="$RUST/vendor.tar.xz"        # compressed cargo-vendored crates.io deps
HEADER_SRC="$REPO/crates/nirs4all-datasets-capi/include/nirs4all_datasets.h"
HEADER_DST="$PKG_ROOT/src/nirs4all_datasets.h"

# The workspace crates the C ABI staticlib needs. Inspecting [dependencies]
# `path = ...` (.workspace = true resolves to these in the root manifest):
#   nirs4all-datasets-capi -> nirs4all-datasets-core
# is the full intra-workspace path closure; both are copied below.
CORE_CRATES="nirs4all-datasets-core nirs4all-datasets-capi"

# A bundled tree counts as present only when the vendored capi manifest, the
# compressed crates.io archive and the copied C header all exist.
if [ -f "$VENDORED/nirs4all-datasets-capi/Cargo.toml" ] \
    && [ -f "$VENDOR_ARCHIVE" ] \
    && [ -f "$HEADER_DST" ]; then
    VENDORED_PRESENT=1
else
    VENDORED_PRESENT=0
fi

# Anything other than N4DS_R_VENDOR=1 is the non-vendoring path: succeed when
# the bundled tree is there, otherwise fail with instructions. Only the forced
# vendoring mode falls through to the rest of the script.
case "${N4DS_R_VENDOR:-0}" in
    1) ;;
    *)
        if [ "$VENDORED_PRESENT" = "1" ]; then
            printf '%s\n' \
                "nirs4alldatasets configure: using bundled vendored Rust tree (offline build)" \
                "  vendored crates : $VENDORED" \
                "  cargo vendor    : $VENDOR_ARCHIVE (extracted by src/Makevars)" \
                "  C ABI header    : $HEADER_DST"
            exit 0
        fi
        printf '%s\n' "nirs4alldatasets configure: no vendored tree and N4DS_R_VENDOR!=1 — set N4DS_R_VENDOR=1 from a full checkout to vendor (needs the monorepo crates/ + network)" >&2
        exit 1
        ;;
esac

# ---------------------------------------------------------------------------
# N4DS_R_VENDOR=1: in-repo vendoring.
# ---------------------------------------------------------------------------
# Vendoring needs cargo (to run `cargo vendor`) and python3 (to edit the copied
# manifests and checksum files); stop early if either is missing.
if ! command -v cargo >/dev/null 2>&1; then
    printf '%s\n' "nirs4alldatasets configure(vendor): ERROR — cargo not found; install Rust from https://rustup.rs" >&2
    exit 1
fi
if ! command -v python3 >/dev/null 2>&1; then
    printf '%s\n' "nirs4alldatasets configure(vendor): ERROR — python3 required to edit the vendored manifests" >&2
    exit 1
fi

printf '%s\n' "nirs4alldatasets configure: CRAN/R-universe self-contained build — vendoring the Rust core from the checkout"

# 0. Copy the committed C ABI header next to the C shim (src/n4ds.c #includes
#    "nirs4all_datasets.h"). The header is generated by cbindgen and committed in
#    the capi crate; the R build never re-runs cbindgen (we drop that build-dep and
#    its build.rs below), it pins this committed copy.
# Refuse to continue without the committed header in the checkout; otherwise
# place a copy next to the C shim.
if [ ! -f "$HEADER_SRC" ]; then
    printf '%s\n' "nirs4alldatasets configure(vendor): ERROR — $HEADER_SRC not found" >&2
    exit 1
fi
cp "$HEADER_SRC" "$HEADER_DST"

# 1. Copy the workspace crates into src/rust/vendored/ (always fresh). Copy every
#    top-level entry EXCEPT tests/ (the [dev-dependencies] surface), target/ (build
#    artefacts), build.rs (the capi's cbindgen header-gen — dropped below), and
#    dotfiles. This captures Cargo.toml + src/ plus the capi's include/ (the
#    committed header) and abi/ (the ld version script, unused once build.rs is
#    gone). The datasets core pulls in NO include_str! data dirs. These plain dirs
#    ship in the tarball as-is.
# Wipe every artefact of a previous vendoring run so the copy is always fresh.
rm -rf "$VENDORED" "$VENDOR_ARCHIVE" "$RUST/vendor" "$RUST/.cargo" "$RUST/Cargo.lock"
mkdir -p "$VENDORED"
for crate in $CORE_CRATES; do
    crate_src="$REPO/crates/$crate"
    crate_dst="$VENDORED/$crate"
    if [ ! -d "$crate_src" ]; then
        printf '%s\n' "nirs4alldatasets configure(vendor): ERROR — $crate_src not found" >&2
        exit 1
    fi
    mkdir -p "$crate_dst"
    # The `*` glob skips dotfiles; tests/, target/ and build.rs are skipped by name.
    for item in "$crate_src"/*; do
        leaf=${item##*/}
        case "$leaf" in
            tests | target | build.rs) ;;
            *) cp -R "$item" "$crate_dst/$leaf" ;;
        esac
    done
done

# 1a. The capi crate ships a build.rs whose only jobs are (a) regenerate the
#     committed header via cbindgen and (b) wire a GNU ld --version-script onto
#     the CDYLIB link. The R package builds the STATICLIB only and pins the
#     committed header, so both jobs are dead weight here. Dropping build.rs (not
#     copied above) + the cbindgen build-dependency removes the entire cbindgen/
#     clap/syn/getrandom/rustix closure from the vendor tree.
printf '%s\n' "nirs4alldatasets configure: dropping the cbindgen build.rs from the vendored capi (staticlib build pins the committed header)"

# 2. Strip test-only [dev-dependencies] from every vendored manifest (size:
#    `cargo vendor` collects the UNION of reachable deps, dev included — tempfile
#    pulls extra crates the R package never compiles), and drop the
#    [build-dependencies] cbindgen from the capi manifest (build.rs is gone).

# strip_dev_build_deps MANIFEST
#   Rewrite the Cargo manifest MANIFEST in place without any dev-/build-
#   dependency table, and print a one-line summary on stdout. All spellings of
#   such a table are removed:
#     [dev-dependencies]                     [build-dependencies]
#     [dev-dependencies.foo]                 [build-dependencies.foo]
#     [target.'cfg(..)'.dev-dependencies]    [target.'cfg(..)'.build-dependencies]
#   Any other header — including an array-of-tables header such as [[bin]] —
#   ends the dropped region, so tables that follow a dropped one are preserved.
#   Returns python3's exit status (non-zero if MANIFEST cannot be read/written).
strip_dev_build_deps() {
    python3 - "$1" <<'PY'
import re
import sys

path = sys.argv[1]
with open(path, encoding="utf-8") as fh:
    lines = fh.read().splitlines()

# A header line is `[name]` or `[[name]]`, optionally followed by a comment.
HEADER = re.compile(r"^\s*\[\[?\s*([^\[\]]+?)\s*\]\]?\s*(?:#.*)?$")
# `dev-dependencies` / `build-dependencies` as a whole dotted-key component,
# wherever it sits in the table name (bare, target-scoped, or per-crate
# sub-table).
DROPPABLE = re.compile(r"(?:^|\.)(?:dev|build)-dependencies(?:\.|$)")

kept = []
dropping = False  # True while inside a table that is being removed
dropped = 0
for line in lines:
    header = HEADER.match(line)
    if header:
        dropping = bool(DROPPABLE.search(header.group(1)))
        if dropping:
            dropped += 1
    if not dropping:
        kept.append(line)

with open(path, "w", encoding="utf-8", newline="\n") as fh:
    fh.write("\n".join(kept) + "\n")
print(f"  {path}: dropped {dropped} dev/build dependency table(s)")
PY
}

for m in "$VENDORED"/*/Cargo.toml; do
    strip_dev_build_deps "$m"
done

# 2a. Drop the `build = ...` / build.rs reference if the capi [package] names one
#     explicitly (it does not — cargo auto-detects build.rs — but be defensive: if
#     a `build =` key exists, neutralize it so cargo does not look for the now
#     absent script).
python3 - "$VENDORED/nirs4all-datasets-capi/Cargo.toml" <<'PY'
import re
import sys

manifest = sys.argv[1]
# Matches a line that begins (after optional whitespace) with a `build =` key,
# in whichever table it appears.
build_key = re.compile(r"\s*build\s*=")

with open(manifest, encoding="utf-8") as fh:
    kept = [row for row in fh.read().splitlines() if build_key.match(row) is None]

with open(manifest, "w", encoding="utf-8", newline="\n") as fh:
    fh.write("\n".join(kept) + "\n")
PY

# 2b. Reduce the capi [lib] crate-type to STATICLIB ONLY. The repo manifest emits
#     ["cdylib", "staticlib", "rlib"]; the R package links the .a staticlib and
#     needs neither the cdylib nor the rlib. Dropping the cdylib is LOAD-BEARING
#     on Windows: rustc only `ar`-archives a staticlib (no final link), so the
#     x86_64-pc-windows-gnu build never invokes a linker — whereas building the
#     cdylib forces a link, and on the CRAN Windows runner CARGO_LINKER is unset,
#     so src/Makevars.win's `CARGO_TARGET_..._LINKER=""` makes that link die with
#     "couldn't extract file stem from specified linker". Staticlib-only sidesteps
#     it entirely (mirrors the nirs4all-io capi + the nirs4all-formats extendr
#     crate, both ["staticlib"] only). Replace the WHOLE crate-type array
#     regardless of its spelling/spacing.
printf '%s\n' "nirs4alldatasets configure: reducing the vendored capi [lib] crate-type to staticlib only (R links the .a; avoids the Windows cdylib link)"
python3 - "$VENDORED/nirs4all-datasets-capi/Cargo.toml" <<'PY'
import re
import sys

manifest = sys.argv[1]
with open(manifest, encoding="utf-8") as fh:
    original = fh.read()

# Replace the whole `crate-type = [...]` array (single- or multi-line).
crate_type = re.compile(r'(?m)^\s*crate-type\s*=\s*\[[^\]]*\]\s*$')
patched, hits = crate_type.subn('crate-type = ["staticlib"]', original)

# Exactly one crate-type is expected; anything else aborts before writing.
if hits != 1:
    sys.exit(f"configure: expected exactly one crate-type in {manifest}, patched {hits}")

with open(manifest, "w", encoding="utf-8", newline="\n") as fh:
    fh.write(patched)
print(f"  {manifest}: crate-type -> [\"staticlib\"]")
PY

# 3. The copied crate Cargo.tomls use `.workspace = true` for version/edition/
#    license/authors/repository/homepage AND for several dependencies (serde,
#    serde_json, thiserror, sha2, hex, directories, ureq, nirs4all-datasets-core).
#    Those inherit from the MONOREPO [workspace], which is NOT in the tarball. So
#    we make src/rust/ a self-contained workspace ROOT that re-declares
#    [workspace.package] and [workspace.dependencies] with the SAME values as the
#    datasets root Cargo.toml, plus the two vendored crates as members. The copied
#    crate manifests then resolve unchanged. The staticlib emit comes from the capi
#    crate's own [lib] crate-type (reduced to ["staticlib"] in step 2b) —
#    src/Makevars builds `-p nirs4all-datasets-capi` and links
#    libnirs4all_datasets_capi.a.
# Read the [workspace.package] version with python3 (already required for this
# vendor path) — portable across GNU and BSD/macOS, unlike a `sed` address-range
# substitute.
WS_VERSION=$(python3 - "$REPO/Cargo.toml" <<'PY'
import re
import sys

with open(sys.argv[1], encoding="utf-8") as fh:
    manifest = fh.read()

# Isolate the body of [workspace.package] (up to the next table header or EOF),
# then take the first `version = "..."` inside it. Nothing is printed when
# either lookup fails, which the shell treats as an error.
table = re.search(r'(?ms)^\[workspace\.package\]\s*$(.*?)(^\[|\Z)', manifest)
body = table.group(1) if table else ""
found = re.search(r'(?m)^\s*version\s*=\s*"([^"]+)"', body)
if found:
    sys.stdout.write(found.group(1))
PY
)
if [ -z "$WS_VERSION" ]; then
    printf '%s\n' "nirs4alldatasets configure(vendor): ERROR — could not read workspace version from $REPO/Cargo.toml" >&2
    exit 1
fi
# Write the generated workspace-root manifest. The heredoc delimiter is
# UNQUOTED, so the shell expands ${WS_VERSION} inside the template; the
# backticks in the template's comment are backslash-escaped so they are emitted
# literally instead of being run as a command substitution.
cat > "$RUST/Cargo.toml" <<TOML
# SPDX-License-Identifier: MIT
# GENERATED by ./configure (N4DS_R_VENDOR=1) — self-contained vendored workspace.
#
# This is the workspace ROOT for the two vendored datasets crates. They inherit
# version/edition/license/deps from [workspace.package] / [workspace.dependencies]
# below, which MIRROR the monorepo root Cargo.toml (the source tarball has no
# monorepo). src/Makevars builds \`-p nirs4all-datasets-capi --offline\` to produce
# the staticlib (libnirs4all_datasets_capi.a), which the C shim (src/n4ds.c) links.
[workspace]
resolver = "2"
members = [
    "vendored/nirs4all-datasets-core",
    "vendored/nirs4all-datasets-capi",
]

[workspace.package]
version = "${WS_VERSION}"
edition = "2021"
license = "MIT"
authors = ["Gregory Beurier <gregory.beurier@cirad.fr>"]
repository = "https://github.com/GBeurier/nirs4all-datasets"
homepage = "https://github.com/GBeurier/nirs4all-datasets"

[workspace.dependencies]
nirs4all-datasets-core = { path = "vendored/nirs4all-datasets-core", version = "${WS_VERSION}" }
serde = { version = "1.0", features = ["derive"] }
serde_json = { version = "1.0", features = ["preserve_order", "float_roundtrip"] }
thiserror = "2.0"
sha2 = "0.10"
hex = "0.4"
directories = "5"
ureq = "2"

# Bound Cargo's release profile to a modest, reproducible setting (mirrors the
# monorepo root: opt-level 2 + strip debuginfo).
[profile.release]
opt-level = 2
strip = "debuginfo"
TOML

# 4. cargo vendor every crates.io transitive dep, then compress it. src/Makevars
#    passes the crates.io source replacement to cargo INLINE via --config, so
#    the package ships NO hidden .cargo/ dir (which would trip R CMD check's
#    "hidden files" NOTE). The same replacement is nevertheless also written to
#    .cargo/config.toml here, purely so that a manual
#    `cd src/rust && cargo build --offline` works for debugging.
printf '%s\n' "nirs4alldatasets configure: cargo vendor (crates.io transitive deps)"
# `cargo vendor` reads the workspace at $RUST and collects the union of every
# reachable crates.io dep for the default (net) feature. The datasets closure is
# pure-Rust (ureq/rustls/ring; no Arrow/Parquet/HDF5), so no --target restriction
# is needed (and this cargo's `vendor` has no --target flag anyway).
# Runs in a subshell so the working directory change does not leak; cargo's
# stdout is discarded.
(
    cd "$RUST" &&
        cargo vendor --versioned-dirs vendor
) >/dev/null

# 4a. Strip CRAN-tripping stray files from the vendored crates — two classes:
#
#       * stray CITATION.cff / CITATION (R CMD check --as-cran NOTEs any CITATION
#         outside inst/: "CITATION file in a non-standard place");
#       * GNU-extension Makefiles bundled inside C-library crates that cargo
#         compiles via their own build.rs and NEVER invokes (ring vendors a
#         libcrypto-style build system; R CMD check's "GNU make extensions"
#         WARNING scans every Makefile/Makevars/GNUmakefile in the unpacked+
#         installed tree, including the extracted vendor/, so they must go).
#
#     Removing a file alone would break cargo's OFFLINE checksum verification, so
#     we also drop its entry from the crate's `.cargo-checksum.json` (cargo only
#     verifies the files it lists). Walk recursively: the offending Makefiles are
#     nested several levels deep inside the C-source subtrees. python3 is the
#     vendoring host's JSON editor; it is only needed when vendoring (CRAN's
#     install never reaches this branch — it unpacks the already-stripped tree).
printf '%s\n' "nirs4alldatasets configure: stripping stray CITATION + GNU-make Makefiles from vendored crates"
python3 - "$RUST/vendor" <<'PY'
import json
import os
import re
import sys

vendor_root = sys.argv[1]

# A vendored file is removed when its BASENAME is a stray CITATION or a
# Make-family file: Makefile / makefile / GNUmakefile / Makevars, optionally
# with an .am / .in / .inc / .toml suffix. Names such as Makefile.win do not
# match. Candidates are taken from the keys of each crate's
# .cargo-checksum.json, which are crate-relative paths, so nested copies are
# found by looking at the basename only.
CITATION_NAMES = frozenset(("CITATION.cff", "CITATION"))
MAKE_FAMILY = re.compile(r"^(GNU)?[Mm]ake(file|vars)(\.(am|in|inc|toml))?$")

citations_gone = 0
makefiles_gone = 0

for crate in sorted(os.listdir(vendor_root)):
    crate_dir = os.path.join(vendor_root, crate)
    checksum_path = os.path.join(crate_dir, ".cargo-checksum.json")
    if not (os.path.isdir(crate_dir) and os.path.exists(checksum_path)):
        continue

    with open(checksum_path, encoding="utf-8") as fh:
        checksums = json.load(fh)
    listed = checksums.get("files", {})

    # Snapshot the keys first: entries are deleted while walking them.
    strays = []
    for rel in list(listed):
        leaf = os.path.basename(rel)
        if leaf in CITATION_NAMES or MAKE_FAMILY.match(leaf):
            strays.append((rel, leaf))
    if not strays:
        continue

    for rel, leaf in strays:
        target = os.path.join(crate_dir, rel)
        if os.path.exists(target):
            os.remove(target)
        # Drop the checksum entry together with the file so the listing and the
        # tree stay in agreement.
        del listed[rel]
        if leaf in CITATION_NAMES:
            citations_gone += 1
        else:
            makefiles_gone += 1

    with open(checksum_path, "w", encoding="utf-8", newline="\n") as fh:
        json.dump(checksums, fh)

print(f"  stripped {citations_gone} CITATION + {makefiles_gone} GNU-make file(s) from the vendored tree")
PY

# 4b. Strip the prebuilt Windows import-lib payloads (windows_<arch>_<env>/lib/*.a
#     and *.lib) from every windows_* crate the R build never LINKS. `cargo vendor`
#     collects the whole resolution graph (every windows_<arch>_<env> for the
#     windows-sys/windows-targets closure pulled in by ureq/rustls/getrandom/
#     directories), and each of those crates ships a 4–13 MB prebuilt import-lib
#     blob — ~76 MB total, by far the heaviest part of the vendor tree. But the R
#     package only ever builds two triples: the host (Linux/macOS, where NO windows
#     crate compiles) and x86_64-pc-windows-gnu (where ONLY windows_x86_64_gnu is
#     linked). Every other arch/env import lib (msvc / gnullvm / i686 / aarch64) is
#     pulled into the lockfile but never extracted or linked for either target, so
#     its binary payload is pure tarball bloat. windows_x86_64_gnu is kept intact
#     (BOTH the 0.48.x and 0.52.x majors, since the exact one the gnu target
#     resolves can't be known without a Windows build). Each removed lib/ file's
#     entry is dropped from .cargo-checksum.json so cargo's OFFLINE checksum
#     verification (which only checks the files it lists, and only for crates it
#     actually extracts) stays consistent — exactly as for the Makefile prune above.
#     Verified: the host offline staticlib build is byte-identical after this prune.
printf '%s\n' "nirs4alldatasets configure: stripping never-linked Windows import-lib payloads from the vendored tree"
python3 - "$RUST/vendor" <<'PY'
import json
import os
import sys

vendor_root = sys.argv[1]
bytes_freed = 0
files_gone = 0


def is_prunable(crate):
    """True for windows_* crates other than windows_x86_64_gnu-<version>.

    The trailing hyphen in the kept prefix matters: it keeps every version of
    windows_x86_64_gnu while still pruning windows_x86_64_gnullvm.
    """
    return crate.startswith("windows_") and not crate.startswith("windows_x86_64_gnu-")


for crate in sorted(os.listdir(vendor_root)):
    if not is_prunable(crate):
        continue
    crate_dir = os.path.join(vendor_root, crate)
    checksum_path = os.path.join(crate_dir, ".cargo-checksum.json")
    if not (os.path.isdir(os.path.join(crate_dir, "lib")) and os.path.exists(checksum_path)):
        continue

    with open(checksum_path, encoding="utf-8") as fh:
        checksums = json.load(fh)
    listed = checksums.get("files", {})

    # Only entries under lib/ are touched; snapshot them before deleting.
    payloads = [rel for rel in list(listed) if rel.startswith("lib/")]
    if not payloads:
        continue

    for rel in payloads:
        target = os.path.join(crate_dir, rel)
        if os.path.exists(target):
            bytes_freed += os.path.getsize(target)
            os.remove(target)
            files_gone += 1
        # The checksum entry goes even if the file was already absent.
        del listed[rel]

    with open(checksum_path, "w", encoding="utf-8", newline="\n") as fh:
        json.dump(checksums, fh)

print(f"  stripped {files_gone} never-linked Windows import-lib(s), {bytes_freed / 1e6:.1f} MB")
PY

# Emit the cargo source-replacement config under src/rust/.cargo/. The heredoc
# delimiter is quoted, so the template is written verbatim.
cargo_cfg_dir="$RUST/.cargo"
mkdir -p "$cargo_cfg_dir"
cat <<'TOML' > "$cargo_cfg_dir/config.toml"
# Offline source replacement for the vendored crates.io deps (debugging aid;
# src/Makevars passes the same replacement INLINE via --config so the installed
# package ships no hidden .cargo/ dir).
[source.crates-io]
replace-with = "vendored-sources"

[source.vendored-sources]
directory = "vendor"
TOML

printf '%s\n' "nirs4alldatasets configure: compressing vendor/ -> vendor.tar.xz (avoids R's tarball dotfile stripping)"
# Archive the vendor directory, and delete it only once tar has succeeded.
(
    cd "$RUST" &&
        tar cJf vendor.tar.xz vendor &&
        rm -rf vendor
)

# Final summary of what was produced.
printf '%s\n' "nirs4alldatasets configure: self-contained Rust tree ready"
printf '%s\n' "  vendored crates : $(ls "$VENDORED" | tr '\n' ' ')"
printf '%s\n' "  vendor archive  : $VENDOR_ARCHIVE ($(du -h "$VENDOR_ARCHIVE" | cut -f1))"
printf '%s\n' "  C ABI header    : $HEADER_DST"
printf '%s\n' "  source replace  : passed INLINE to cargo by src/Makevars (--config), no .cargo/ dir in the tarball"
