Initial commit: Speckle-Scanner 3D pipeline with setup README

2026-06-10 03:09:05 +05:00
commit 1765934846
375 changed files with 123081 additions and 0 deletions
@@ -0,0 +1,45 @@
 # Python
 __pycache__/
 *.py[cod]
 *$py.class
 *.egg-info/
 .eggs/
 dist/
 build/
 *.egg
 .venv/
 venv/
 env/
 .env
 *.log
 # IDE / editor
 .cursor/
 .vscode/
 .idea/
 *.swp
 *.swo
 *~
 .DS_Store
 # libSGM — compiled locally on each machine (see README setup)
 05_disparity/libsgm/build/
 # Optional local test output
 06_Pointcloud/output/
 **/output/
 # Data directories — live OUTSIDE this repo under ~/
 # (listed here in case someone copies data into the clone by mistake)
 3D-Scans/
 Calib-data/
 Speckle-Scanner_Processing_data/
 # Large / generated artifacts
 *.o
 *.so
 *.a
 *.ply
 *.bmp
 *.npy
 !05_disparity/libsgm/sample/**
@@ -0,0 +1,408 @@
 # 02 Calibration
 Two-step calibration pipeline:
 | Step | Script | What it does |
 |------|--------|--------------|
 | **1. Detection** | `detect_features.py` | Chessboard corners / IR ellipses → **JSON next to each image** |
 | **2. Calibration** | `calibrate.py` | Mono intrinsics per camera + stereo **lc vs rc/rg/ir** |
 `main.py` runs both steps by default (`--step all`).
 ---
 ## Troubleshooting flag
 All calibration scripts accept `--troubleshooting` (default: **off**).
 | `--troubleshooting` | Logs | Disk output |
 |---------------------|------|-------------|
 | **off** (flag omitted, default) | Minimal summary per camera / stereo pair | Step 1: `*.json` only (required for step 2). Step 2: **`params/` only** |
 | **on** (flag passed) | Detailed per-image / per-pair logs, progress bars | Step 1: + `corners/<camera>/` overlays. Step 2: + `pairing_reports/`, `rectified/` |
 ```bash
 # Default — minimal logs, only params/ from step 2
 python main.py --project Olsen_wings --date 2026-05-12 --calib_name calib1
 # Debug — verbose logs + intermediate folders
 python main.py --project Olsen_wings --date 2026-05-12 --calib_name calib1 --troubleshooting
 ```
 Legacy mode (`--legacy`) also respects `--troubleshooting` (corners, local_coords, images_ncb, rectified).
 ---
 ## All CLI parameters (reference)
 | Parameter | Default | Used in |
 |-----------|---------|---------|
 | `--project` | required | all |
 | `--date` | required | all |
 | `--calib_name` | `calib1` | all |
 | `--chessboard_size` | `8,7` | all |
 | `--square_size` | `0.045` | all |
 | `--left_chessboard_size` | = `--chessboard_size` | all |
 | `--right_chessboard_size` | = `--chessboard_size` | all |
 | `--left_square_size` | = `--square_size` | all |
 | `--right_square_size` | = `--square_size` | all |
 | `--preprocessing` | `None` | step 1 (`G`, `C`, `T` chain) |
 | `--cameras` | all present | `detect_features.py` |
 | `--ir_mode` | `auto` | step 1 (`auto` / `chessboard` / `ellipse`) |
 | `--step` | `all` | `main.py` (`detect`/`calibrate`/`all`); `calibrate.py` (`mono`/`stereo`/`all`) |
 | `--left_camera` | `lc` | step 2 stereo (`lc` / `lc-ir`; `lc_ir` is accepted as an alias) |
 | `--time_window` | `0.1` | step 2 stereo (seconds) |
 | `--partners` | `rc,rg,ir` | step 2 stereo |
 | `--legacy` | off | `main.py` only |
 | `--right_camera` | `rc` | `main.py --legacy` only |
 | `--troubleshooting` | off | all (flag omitted = minimal; flag passed = debug output) |
 ---
 ## Folder structure
 ```
 ~/Calib-data/<project>/<date>/<calib_name>/
 ├── lc/
 │   ├── lc_1778599872850705.bmp
 │   └── lc_1778599872850705.json      ← step 1 (always)
 ├── rc/
 ├── rg/   (or rgb/)
 ├── ir/   (or IR/)
 ├── corners/                          ← step 1, only with --troubleshooting
 ├── pairing_reports/                  ← step 2, only with --troubleshooting
 ├── rectified/                        ← step 2, only with --troubleshooting
 └── params/                           ← step 2 (always)
    ├── lc_intrinsics.npz
    ├── rc_intrinsics.npz
    ├── lc-rc_parameters.npz
    ├── lc-rc_stereo_cam_model.yaml
    ├── lc-rc_Q.cvstore
    ├── lc-rg_*
    └── lc-ir_*
 ```
 Nested layout (`<calib_name>/images/lc/`, …) is also supported.
 ---
 ## Quick start (full pipeline)
 ```bash
 cd ~/Speckle-Scanner/02_Calibration
 python main.py \
  --project Olsen_wings --date 2026-05-12 --calib_name calib1 \
  --chessboard_size 8,7 --square_size 0.045
 ```
 Or run steps separately:
 ```bash
 # Step 1 — detect features, write JSON
 python detect_features.py \
  --project Olsen_wings --date 2026-05-12 --calib_name calib1 \
  --chessboard_size 8,7 --square_size 0.045
 # Step 2 — calibrate from JSON (writes params/ only)
 python calibrate.py \
  --project Olsen_wings --date 2026-05-12 --calib_name calib1 \
  --chessboard_size 8,7 --square_size 0.045 \
  --time_window 0.1
 ```
 ---
 ## Step 1 — Feature detection (per camera)
 For every image in each camera folder (`lc`, `rc`, `rg`, `ir`, `lc-ir`):
 - Detects **chessboard corners** (default for lc/rc/rg)
 - For **IR**: tries chessboard first (`--ir_mode auto`), falls back to **ellipse center**
 - Writes `<image>.json` in the **same folder** as the image (always, even without `--troubleshooting`)
 ### LC only
 ```bash
 python detect_features.py \
  --project Olsen_wings --date 2026-05-12 --calib_name calib1 \
  --cameras lc \
  --chessboard_size 8,7 --square_size 0.045 \
  --left_chessboard_size 8,7 --left_square_size 0.045 \
  --preprocessing None
 ```
 ### RC only
 ```bash
 python detect_features.py \
  --project Olsen_wings --date 2026-05-12 --calib_name calib1 \
  --cameras rc \
  --chessboard_size 8,7 --square_size 0.045 \
  --right_chessboard_size 8,7 --right_square_size 0.045 \
  --preprocessing None
 ```
 ### RG only
 ```bash
 python detect_features.py \
  --project Olsen_wings --date 2026-05-12 --calib_name calib1 \
  --cameras rg \
  --chessboard_size 8,7 --square_size 0.045 \
  --right_chessboard_size 8,7 --right_square_size 0.045 \
  --preprocessing None
 ```
 ### IR only
 ```bash
 python detect_features.py \
  --project Olsen_wings --date 2026-05-12 --calib_name calib1 \
  --cameras ir \
  --chessboard_size 8,7 --square_size 0.045 \
  --right_chessboard_size 8,7 --right_square_size 0.045 \
  --preprocessing C \
  --ir_mode auto
 ```
 ### LC-IR folder only
 ```bash
 python detect_features.py \
  --project Olsen_wings --date 2026-05-12 --calib_name calib1 \
  --cameras lc-ir \
  --chessboard_size 8,7 --square_size 0.045 \
  --left_chessboard_size 8,7 --left_square_size 0.045 \
  --preprocessing None
 ```
 ### All cameras
 ```bash
 python detect_features.py \
  --project Olsen_wings --date 2026-05-12 --calib_name calib1 \
  --chessboard_size 8,7 --square_size 0.045 \
  --left_chessboard_size 8,7 --left_square_size 0.045 \
  --right_chessboard_size 8,7 --right_square_size 0.045 \
  --preprocessing None \
  --ir_mode auto
 ```
 ### Step 1 with troubleshooting
 ```bash
 python detect_features.py \
  --project Olsen_wings --date 2026-05-12 --calib_name calib1 \
  --cameras lc,rc,ir \
  --chessboard_size 8,7 --square_size 0.045 \
  --preprocessing C \
  --ir_mode auto \
  --troubleshooting
 ```
 ### JSON contents (chessboard example)
 ```json
 {
  "version": 1,
  "image": "lc_1778599872850705.bmp",
  "camera_folder": "lc",
  "feature_type": "chessboard",
  "success": true,
  "board_size": [8, 7],
  "square_size": 0.045,
  "timestamp_sec": 1778599872.850705,
  "pair_key": "1778599872850705",
  "corners": [[412.3, 287.1], ...]
 }
 ```
 ---
 ## Step 2 — Calibration
 ### 2a. Mono intrinsics
 Reads chessboard JSONs from each camera folder, runs `cv2.calibrateCamera`, saves:
 - `params/<camera>_intrinsics.npz`
 - `params/<camera>_intrinsics.yaml`
 Requires **≥ 3** successful chessboard detections per camera.
 ### 2b. Stereo calibration
 - **Left camera:** `lc` by default (`--left_camera`)
 - **Partners:** `rc`, `rg`, `ir` — each available folder is calibrated against lc
 - **Pairing:** time-window match (`--time_window`, default **0.1 s**), then filename `pair_key` fallback for IR scan ids
 - Uses mono intrinsics with `CALIB_FIX_INTRINSIC`
 - Saves `lc-rc_*`, `lc-rg_*`, `lc-ir_*` under `params/`
 ### Full step 2 (mono + all stereo pairs)
 ```bash
 python calibrate.py \
  --project Olsen_wings --date 2026-05-12 --calib_name calib1 \
  --step all \
  --left_camera lc \
  --partners rc,rg,ir \
  --time_window 0.1 \
  --chessboard_size 8,7 --square_size 0.045 \
  --left_chessboard_size 8,7 --left_square_size 0.045 \
  --right_chessboard_size 8,7 --right_square_size 0.045
 ```
 ### Stereo: LC ↔ RC only
 ```bash
 python calibrate.py \
  --project Olsen_wings --date 2026-05-12 --calib_name calib1 \
  --step stereo \
  --left_camera lc \
  --partners rc \
  --time_window 0.1 \
  --chessboard_size 8,7 --square_size 0.045
 ```
 ### Stereo: LC ↔ RG only
 ```bash
 python calibrate.py \
  --project Olsen_wings --date 2026-05-12 --calib_name calib1 \
  --step stereo \
  --left_camera lc \
  --partners rg \
  --time_window 0.1 \
  --chessboard_size 8,7 --square_size 0.045
 ```
 ### Stereo: LC ↔ IR only
 ```bash
 python calibrate.py \
  --project Olsen_wings --date 2026-05-12 --calib_name calib1 \
  --step stereo \
  --left_camera lc \
  --partners ir \
  --time_window 0.1 \
  --chessboard_size 8,7 --square_size 0.045
 ```
 ### Stereo: left = LC-IR folder
 ```bash
 python calibrate.py \
  --project Olsen_wings --date 2026-05-12 --calib_name calib1 \
  --step stereo \
  --left_camera lc-ir \
  --partners rc,rg,ir \
  --time_window 0.1 \
  --chessboard_size 8,7 --square_size 0.045 \
  --left_chessboard_size 8,7 --left_square_size 0.045 \
  --right_chessboard_size 8,7 --right_square_size 0.045
 ```
 ### Mono intrinsics only
 ```bash
 python calibrate.py \
  --project Olsen_wings --date 2026-05-12 --calib_name calib1 \
  --step mono \
  --chessboard_size 8,7 --square_size 0.045 \
  --left_chessboard_size 8,7 --left_square_size 0.045 \
  --right_chessboard_size 8,7 --right_square_size 0.045
 ```
 ### Step 2 with troubleshooting
 ```bash
 python calibrate.py \
  --project Olsen_wings --date 2026-05-12 --calib_name calib1 \
  --step all \
  --time_window 0.1 \
  --chessboard_size 8,7 --square_size 0.045 \
  --troubleshooting
 ```
 Writes `params/` plus `pairing_reports/<pair>.txt` and `rectified/<pair>/`.
 ---
 ## Full pipeline (`main.py`)
 ```bash
 python main.py \
  --project Olsen_wings --date 2026-05-12 --calib_name calib1 \
  --step all \
  --left_camera lc \
  --partners rc,rg,ir \
  --time_window 0.1 \
  --chessboard_size 8,7 --square_size 0.045 \
  --left_chessboard_size 8,7 --left_square_size 0.045 \
  --right_chessboard_size 8,7 --right_square_size 0.045 \
  --preprocessing None \
  --ir_mode auto
 ```
 With troubleshooting:
 ```bash
 python main.py \
  --project Olsen_wings --date 2026-05-12 --calib_name calib1 \
  --troubleshooting \
  --chessboard_size 8,7 --square_size 0.045
 ```
 ---
 ## Legacy one-shot mode
 The old in-memory flow (single `--right_camera`, filename pairing) still works:
 ```bash
 # LC-RC
 python main.py --legacy \
  --project Olsen_wings --date 2026-05-12 --calib_name calib1 \
  --left_camera lc --right_camera rc \
  --chessboard_size 8,7 --square_size 0.045
 # LC-RG
 python main.py --legacy \
  --project Olsen_wings --date 2026-05-12 --calib_name calib1 \
  --left_camera lc --right_camera rg \
  --chessboard_size 8,7 --square_size 0.045
 # LC-IR
 python main.py --legacy \
  --project Olsen_wings --date 2026-05-12 --calib_name calib1 \
  --left_camera lc --right_camera ir \
  --chessboard_size 8,7 --square_size 0.045 \
  --preprocessing C
 # LC-IR folder + IR partner (with debug output)
 python main.py --legacy \
  --project Olsen_wings --date 2026-05-12 --calib_name calib1 \
  --left_camera lc-ir --right_camera ir \
  --chessboard_size 8,7 --square_size 0.045 \
  --preprocessing C --troubleshooting
 ```
 ---
 ## Dependencies
 ```bash
 pip install -r ~/Speckle-Scanner/02_Calibration/requirements.txt
 # or full pipeline:
 pip install -r ~/Speckle-Scanner/requirements.txt
 ```
 ---
 ## Notes
 - Stereo pairing uses **timestamps** parsed from filenames (`ts…` tokens or long numeric ids); `ck…` suffixes are ignored.
 - **Ellipse-only** IR JSONs are stored but cannot produce mono intrinsics (need full chessboard grids). Use chessboard IR images for calibration.
 - Per-camera board overrides apply to detection and calibration (`--left_chessboard_size`, etc.).
 - Re-run **step 1** if images change; re-run **step 2** freely when tuning `time_window` or partners.
 - With `--troubleshooting` off, step 2 writes **only** `params/` (no `pairing_reports/`, no `rectified/`).
@@ -0,0 +1,106 @@
 #!/usr/bin/env python3
 """
 Step 2 — Calibration from per-image JSON feature files.
  2a. Mono intrinsics per camera folder
  2b. Stereo calibration: left camera vs each available partner (rc, rg, ir)
      with time-window pairing (default 0.1 s)
 """
 import sys
 from pathlib import Path
 sys.path.insert(0, str(Path.home() / "Speckle-Scanner"))
 sys.path.insert(0, str(Path(__file__).resolve().parent))
 import argparse
 from calibrationclasses.calibration_engine import (
    run_mono_calibration,
    run_stereo_calibration,
 )
 from calibrationclasses.cli_common import (
    add_board_args,
    add_session_args,
    add_troubleshooting_arg,
    build_board_config,
    resolve_input_path,
 )
 from calibrationclasses.session import STEREO_PARTNERS
 def main():
    """Command-line entry point for calibration step 2.

    Builds the argument parser, then runs mono intrinsics and/or stereo
    calibration depending on ``--step``. When only the stereo step is
    requested, the mono calibration is still executed first because its
    results are handed to ``run_stereo_calibration``.
    """
    cli = argparse.ArgumentParser(
        description="Calibration step 2: mono + stereo calibration from JSON"
    )
    add_session_args(cli)
    add_board_args(cli)
    cli.add_argument(
        "--step",
        default="all",
        choices=("mono", "stereo", "all"),
        help="Run mono intrinsics, stereo pairs, or both (default: all)",
    )
    cli.add_argument(
        "--left_camera",
        default="lc",
        choices=("lc", "lc-ir", "lc_ir"),
        help="Left camera for stereo calibration (default: lc)",
    )
    cli.add_argument(
        "--time_window",
        default=0.1,
        type=float,
        help="Max |t_left - t_right| in seconds for stereo pairing (default: 0.1)",
    )
    cli.add_argument(
        "--partners",
        default="rc,rg,ir",
        type=str,
        help="Comma-separated right cameras for stereo (default: rc,rg,ir)",
    )
    add_troubleshooting_arg(cli)
    opts = cli.parse_args()

    # "lc_ir" is accepted on the command line; normalise it to the hyphenated form.
    stereo_left = opts.left_camera.lower().replace("_", "-")
    # Drop empty entries so inputs such as "rc,,ir" or a trailing comma are harmless.
    stripped = (chunk.strip() for chunk in opts.partners.split(","))
    partner_names = tuple(name for name in stripped if name)
    board_sizes, square_sizes = build_board_config(opts)
    session_path = resolve_input_path(opts)
    print(f"[calibrate] session: {session_path}")

    def mono_pass():
        # One place for the mono call so both branches pass identical arguments.
        return run_mono_calibration(
            session_path,
            board_sizes,
            square_sizes,
            troubleshooting=opts.troubleshooting,
        )

    mono_results = {}
    if opts.step in ("mono", "all"):
        print("\n=== Step 2a: Mono intrinsics ===")
        mono_results = mono_pass()

    if opts.step in ("stereo", "all"):
        print("\n=== Step 2b: Stereo calibration ===")
        # Stereo-only runs have no mono results yet; compute them now.
        if opts.step == "stereo" and not mono_results:
            mono_results = mono_pass()
        run_stereo_calibration(
            session_path,
            left_camera=stereo_left,
            mono_results=mono_results,
            board_sizes=board_sizes,
            square_sizes=square_sizes,
            time_window_sec=opts.time_window,
            # An empty --partners value falls back to the package default.
            partners=partner_names or STEREO_PARTNERS,
            troubleshooting=opts.troubleshooting,
        )
    print("[calibrate] done")
 # Run the CLI only when this file is executed as a script, not when imported.
 if __name__ == "__main__":
    main()
@@ -0,0 +1,424 @@
 """Step 2: mono and stereo calibration from per-image JSON feature files."""
 from __future__ import annotations
 import os
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple
 import cv2
 import numpy as np
 from calibrationclasses.feature_json import FeatureRecord, load_folder_features
 from calibrationclasses.pairing import StereoPair, build_stereo_pairs
 from calibrationclasses.session import (
    CameraFolder,
    discover_camera_folder,
    resolve_session_root,
    STEREO_PARTNERS,
 )
 def create_3d_board_points(board_size: Tuple[int, int], square_size: float) -> np.ndarray:
    """Build the planar (z = 0) object points of a chessboard grid.

    Args:
        board_size: Number of grid points along the first and second axis.
        square_size: Spacing between neighbouring grid points.

    Returns:
        A float32 array of shape (board_size[0] * board_size[1], 3). The first
        coordinate varies fastest; the third column is always zero.
    """
    first_axis, second_axis = board_size
    # Transposing before flattening makes the first coordinate the fast one.
    grid_xy = np.mgrid[0:first_axis, 0:second_axis].T.reshape(-1, 2)
    points = np.zeros((first_axis * second_axis, 3), np.float32)
    points[:, :2] = grid_xy
    points *= square_size
    return points
 def _image_size_from_records(records: List[FeatureRecord]) -> Tuple[int, int]:
    """Return (width, height) of the first record image that OpenCV can read.

    Records are tried in order and reading stops at the first success.

    Raises:
        RuntimeError: If no record's image could be read.
    """
    # Both generators are lazy, so images are only read until one succeeds.
    loaded = (cv2.imread(str(rec.image_path)) for rec in records)
    first_ok = next((frame for frame in loaded if frame is not None), None)
    if first_ok is None:
        raise RuntimeError("Could not determine image size from feature JSONs")
    height, width = first_ok.shape[0], first_ok.shape[1]
    return width, height
 def calibrate_camera_intrinsics(
    records: List[FeatureRecord],
    board_size: Tuple[int, int],
    square_size: float,
 ) -> Dict:
    """Estimate single-camera intrinsics from chessboard feature records.

    Only records flagged ``is_chessboard`` are used. Their ``corners`` are
    passed to ``cv2.calibrateCamera`` together with one shared set of board
    object points.

    Args:
        records: Feature records of one camera folder.
        board_size: Chessboard grid size used to build the object points.
        square_size: Chessboard square size used to scale the object points.

    Returns:
        Dict with the camera matrix, distortion coefficients, the optimal new
        camera matrix and its ROI, per-view rotations/translations/4x4
        extrinsics, the value returned by ``cv2.calibrateCamera`` (stored as
        ``MeanError``), the image size and the number of views used.

    Raises:
        RuntimeError: If fewer than 3 chessboard records are available, or if
            the image size cannot be determined.
    """
    usable = [rec for rec in records if rec.is_chessboard]
    view_count = len(usable)
    if view_count < 3:
        raise RuntimeError(
            f"Need at least 3 chessboard detections for mono calibration, got {view_count}"
        )

    image_size = _image_size_from_records(usable)
    board_points = create_3d_board_points(board_size, square_size)
    # Every view observes the same board, so the object points are shared.
    object_points = [board_points] * view_count
    image_points = [rec.corners for rec in usable]

    rms, camera_matrix, dist_coeffs, rvecs, tvecs = cv2.calibrateCamera(
        object_points, image_points, image_size, None, None, flags=0
    )

    # Convert each Rodrigues vector to a 3x3 matrix and a homogeneous 4x4 pose.
    rot_matrices = [cv2.Rodrigues(rvec)[0] for rvec in rvecs]
    extrinsics = [
        np.vstack((np.hstack((rot, tvec)), np.array([0, 0, 0, 1])))
        for rot, tvec in zip(rot_matrices, tvecs)
    ]

    new_matrix, roi = cv2.getOptimalNewCameraMatrix(
        camera_matrix, dist_coeffs, image_size, 1, image_size
    )
    if np.sum(roi) == 0:
        # An all-zero ROI is replaced by the full image rectangle.
        roi = (0, 0, image_size[0] - 1, image_size[1] - 1)

    return {
        "Intrinsic": camera_matrix,
        "Distortion": dist_coeffs,
        "DistortionROI": roi,
        "DistortionIntrinsic": new_matrix,
        "RotVektor": rvecs,
        "RotMatrix": rot_matrices,
        "Extrinsics": extrinsics,
        "TransVektor": tvecs,
        "MeanError": float(rms),
        "image_size": image_size,
        "num_views": view_count,
    }
 def save_mono_intrinsics(
    params_dir: Path,
    camera_name: str,
    intrinsics: Dict,
    *,
    troubleshooting: bool = False,
 ) -> None:
    """Write one camera's intrinsics as ``.npz`` and OpenCV ``.yaml`` files.

    Args:
        params_dir: Output directory; created if it does not exist.
        camera_name: Camera name used as file prefix ("/" is replaced by "-").
        intrinsics: Result dict providing the keys written below.
        troubleshooting: When true, print the paths of the written files.
    """
    params_dir.mkdir(parents=True, exist_ok=True)
    stem = f"{camera_name.replace('/', '-')}_intrinsics"
    npz_path = params_dir / (stem + ".npz")
    yaml_path = params_dir / (stem + ".yaml")

    # The NPZ archive holds the full summary; entries are written in this order.
    archive = {
        key: intrinsics[key]
        for key in (
            "Intrinsic",
            "Distortion",
            "DistortionIntrinsic",
            "DistortionROI",
            "MeanError",
        )
    }
    archive["image_size"] = np.array(intrinsics["image_size"])
    archive["num_views"] = intrinsics["num_views"]
    np.savez(npz_path, **archive)

    # The YAML file only carries the three matrices.
    storage = cv2.FileStorage(str(yaml_path), cv2.FILE_STORAGE_WRITE)
    for key in ("Intrinsic", "Distortion", "DistortionIntrinsic"):
        storage.write(key, intrinsics[key])
    storage.release()

    if troubleshooting:
        print(f"[INFO] Saved mono intrinsics → {npz_path} and {yaml_path}")
 def run_mono_calibration(
    input_path: str | Path,
    board_sizes: Dict[str, Tuple[int, int]],
    square_sizes: Dict[str, float],
    cameras: Optional[List[str]] = None,
    troubleshooting: bool = False,
 ) -> Dict[str, Dict]:
    """Run mono calibration for every camera listed in ``board_sizes``.

    Cameras are skipped when they are excluded by ``cameras``, when no folder
    is discovered for them, or when calibration raises ``RuntimeError``.
    Successful results are saved under ``<input_path>/params``.

    Args:
        input_path: Session path; also the parent of the ``params`` folder.
        board_sizes: Chessboard grid size per logical camera name.
        square_sizes: Chessboard square size per logical camera name.
        cameras: Optional allow-list of logical camera names; falsy means all.
        troubleshooting: When true, print the reason for each skipped camera.

    Returns:
        Mapping from logical camera name to its intrinsics dict, containing
        only the cameras that calibrated successfully.
    """
    root = resolve_session_root(input_path)
    out_dir = Path(input_path) / "params"
    calibrated = {}

    for cam_name, grid in board_sizes.items():
        excluded = bool(cameras) and cam_name not in cameras
        if excluded:
            continue
        folder = discover_camera_folder(root, cam_name)
        if folder is None:
            continue

        features = load_folder_features(folder.path)
        cell = square_sizes[cam_name]
        try:
            result = calibrate_camera_intrinsics(features, grid, cell)
            save_mono_intrinsics(
                out_dir, cam_name, result, troubleshooting=troubleshooting
            )
            calibrated[cam_name] = result
            print(
                f"[mono:{cam_name}] views={result['num_views']} "
                f"reproj_err={result['MeanError']:.4f}"
            )
        except RuntimeError as exc:
            # One camera failing must not stop the remaining cameras.
            print(
                f"[SKIP mono:{cam_name}] {exc}"
                if troubleshooting
                else f"[mono:{cam_name}] skipped"
            )
    return calibrated
 def calibrate_stereo_pair(
    pairs: List[StereoPair],
    left_intrinsics: Dict,
    right_intrinsics: Dict,
    board_size: Tuple[int, int],
    square_size: float,
    image_size: Tuple[int, int],
 ) -> Dict:
    """Stereo-calibrate two cameras from matched chessboard detections.

    The mono intrinsics are kept fixed (``cv2.CALIB_FIX_INTRINSIC``). After
    ``cv2.stereoCalibrate`` the rectification transforms are computed with
    ``cv2.stereoRectify`` (``flags=0``, ``alpha=1``).

    Args:
        pairs: Matched left/right feature records.
        left_intrinsics: Mono result dict of the left camera.
        right_intrinsics: Mono result dict of the right camera.
        board_size: Chessboard grid size used to build the object points.
        square_size: Chessboard square size used to scale the object points.
        image_size: Image size passed to both OpenCV calls.

    Returns:
        Dict with rotation, translation, the combined 4x4 transformation,
        essential/fundamental matrices, the value returned by
        ``cv2.stereoCalibrate`` (``MeanError``), ``Q``, the rectification
        matrices, copies of both cameras' intrinsics and the image points used.

    Raises:
        RuntimeError: If ``pairs`` is empty.
    """
    if not pairs:
        raise RuntimeError("No stereo pairs available")

    board_points = create_3d_board_points(board_size, square_size)
    object_points = [board_points] * len(pairs)
    left_points = [match.left.corners for match in pairs]
    right_points = [match.right.corners for match in pairs]

    k_left, d_left = left_intrinsics["Intrinsic"], left_intrinsics["Distortion"]
    k_right, d_right = right_intrinsics["Intrinsic"], right_intrinsics["Distortion"]

    stop_rule = (cv2.TERM_CRITERIA_MAX_ITER + cv2.TERM_CRITERIA_EPS, 30, 0.001)
    stereo_out = cv2.stereoCalibrate(
        object_points,
        left_points,
        right_points,
        k_left,
        d_left,
        k_right,
        d_right,
        image_size,
        criteria=stop_rule,
        flags=cv2.CALIB_FIX_INTRINSIC,
    )
    # Positions 1-4 are the camera matrices / distortions, which stay fixed.
    rms = stereo_out[0]
    rot, trans, essential, fundamental = stereo_out[5:9]

    R1, R2, P1, P2, Q, _roi_left, _roi_right = cv2.stereoRectify(
        k_left,
        d_left,
        k_right,
        d_right,
        image_size,
        rot,
        trans,
        flags=0,
        alpha=1,
    )

    pose = np.vstack((np.hstack((rot, trans)), np.array([0, 0, 0, 1])))
    return {
        "Translation": trans,
        "Rotation": rot,
        "Transformation": pose,
        "Essential": essential,
        "Fundamental": fundamental,
        "MeanError": float(rms),
        "SquareSize": square_size,
        "BoardSize": board_size,
        "Objpoints": board_points,
        "Q": np.array(Q, dtype=np.float64),
        "num_pairs": len(pairs),
        "L_Intrinsic": k_left,
        "L_Distortion": d_left,
        "L_DistortionIntrinsic": left_intrinsics["DistortionIntrinsic"],
        "R_Intrinsic": k_right,
        "R_Distortion": d_right,
        "R_DistortionIntrinsic": right_intrinsics["DistortionIntrinsic"],
        "L_Imgpoints": left_points,
        "R_Imgpoints": right_points,
        "R1": R1,
        "R2": R2,
        "P1": P1,
        "P2": P2,
        "image_size": image_size,
    }
 def save_stereo_calibration(
    input_path: str | Path,
    pair_tag: str,
    parameters: Dict,
    *,
    troubleshooting: bool = False,
 ) -> None:
    """Write a stereo result as NPZ, YAML camera model and standalone Q file.

    All files go to ``<input_path>/params`` and are prefixed with ``pair_tag``.

    Args:
        input_path: Session path; parent of the ``params`` folder.
        pair_tag: File prefix identifying the camera pair.
        parameters: Stereo result dict providing the keys written below.
        troubleshooting: When true, print the path of each written file.
    """
    out_dir = Path(input_path) / "params"
    out_dir.mkdir(parents=True, exist_ok=True)
    q_matrix = np.array(parameters["Q"], dtype=np.float64)

    def log(message: str) -> None:
        if troubleshooting:
            print(message)

    # The rectification matrices are left out of the NPZ archive.
    npz_path = out_dir / f"{pair_tag}_parameters.npz"
    rectify_only = ("R1", "R2", "P1", "P2")
    np.savez(
        npz_path,
        **{key: val for key, val in parameters.items() if key not in rectify_only},
    )
    log(f"[INFO] Saved NPZ → {npz_path}")

    yaml_path = out_dir / f"{pair_tag}_stereo_cam_model.yaml"
    model = cv2.FileStorage(str(yaml_path), cv2.FILE_STORAGE_WRITE)
    for side in ("L", "R"):
        for suffix in ("DistortionIntrinsic", "Intrinsic", "Distortion"):
            key = f"{side}_{suffix}"
            model.write(key, parameters[key])
    # Rotation and translation are sliced out of the 4x4 transformation.
    pose = parameters["Transformation"]
    model.write("Rotation", pose[:3, :3])
    model.write("Translation", pose[:3, 3:])
    model.write("Q", q_matrix)
    model.release()
    log(f"[INFO] Saved YAML → {yaml_path}")

    cvstore_path = out_dir / f"{pair_tag}_Q.cvstore"
    q_store = cv2.FileStorage(str(cvstore_path), cv2.FILE_STORAGE_WRITE)
    q_store.write("Q", q_matrix)
    q_store.release()
    log(f"[INFO] Saved Q → {cvstore_path}")
 def save_pairing_report(
    input_path: str | Path,
    pair_tag: str,
    pairs: List[StereoPair],
 ) -> Path:
    """Write a tab-separated listing of the matched stereo pairs.

    The report is stored as ``<input_path>/pairing_reports/<pair_tag>.txt``
    with two ``#`` header lines, a column header and one row per pair.

    Args:
        input_path: Session path; parent of the ``pairing_reports`` folder.
        pair_tag: Pair identifier used as file name and in the header.
        pairs: Matched left/right records to list.

    Returns:
        Path of the written report file.
    """
    out_dir = Path(input_path) / "pairing_reports"
    out_dir.mkdir(parents=True, exist_ok=True)
    target = out_dir / f"{pair_tag}.txt"

    header = [
        f"# stereo pairs for {pair_tag}",
        f"# total={len(pairs)}",
        "left_image\tright_image\tdelta_sec\tmethod",
    ]
    rows = [
        "\t".join(
            (
                match.left.image_path.name,
                match.right.image_path.name,
                f"{match.delta_sec:.6f}",
                f"{match.method}",
            )
        )
        for match in pairs
    ]
    target.write_text("\n".join(header + rows) + "\n", encoding="utf-8")
    print(f"[INFO] Pairing report → {target}")
    return target
 def save_rectified_pairs(
    input_path: str | Path,
    pair_tag: str,
    pairs: List[StereoPair],
    parameters: Dict,
    left_folder: str,
    right_folder: str,
 ) -> None:
    """Rectify every readable stereo pair and write the images to disk.

    Output goes to ``<input_path>/rectified/<pair_tag>/<left_folder>`` and
    ``.../<right_folder>``, keeping the original file names. Pairs where
    either image cannot be read are skipped.

    Args:
        input_path: Session path; parent of the ``rectified`` folder.
        pair_tag: Sub-folder name identifying the camera pair.
        pairs: Matched left/right records whose images are remapped.
        parameters: Stereo result dict with intrinsics, R1/R2, P1/P2 and
            ``image_size``.
        left_folder: Output sub-folder name for left images.
        right_folder: Output sub-folder name for right images.
    """
    image_size = parameters["image_size"]

    def rectify_maps(side: str, rect_rot, projection):
        # Both cameras use the same image_size for their remap tables.
        return cv2.initUndistortRectifyMap(
            parameters[f"{side}_Intrinsic"],
            parameters[f"{side}_Distortion"],
            rect_rot,
            projection,
            image_size,
            cv2.CV_32FC1,
        )

    R1, R2, P1, P2 = (parameters[key] for key in ("R1", "R2", "P1", "P2"))
    left_x, left_y = rectify_maps("L", R1, P1)
    right_x, right_y = rectify_maps("R", R2, P2)

    out_left = Path(input_path) / "rectified" / pair_tag / left_folder
    out_right = Path(input_path) / "rectified" / pair_tag / right_folder
    for folder in (out_left, out_right):
        folder.mkdir(parents=True, exist_ok=True)

    written = 0
    for match in pairs:
        left_src = match.left.image_path
        right_src = match.right.image_path
        left_img = cv2.imread(str(left_src))
        right_img = cv2.imread(str(right_src))
        if left_img is None or right_img is None:
            continue
        cv2.imwrite(
            str(out_left / left_src.name),
            cv2.remap(left_img, left_x, left_y, cv2.INTER_LINEAR),
        )
        cv2.imwrite(
            str(out_right / right_src.name),
            cv2.remap(right_img, right_x, right_y, cv2.INTER_LINEAR),
        )
        written += 1
    print(f"[INFO] Rectified {written}/{len(pairs)} pairs → {out_left.parent}")
 def run_stereo_calibration(
    input_path: str | Path,
    left_camera: str,
    mono_results: Dict[str, Dict],
    board_sizes: Dict[str, Tuple[int, int]],
    square_sizes: Dict[str, float],
    time_window_sec: float = 0.1,
    partners: Tuple[str, ...] = STEREO_PARTNERS,
    troubleshooting: bool = False,
 ) -> None:
    """Calibrate the left camera against each available partner camera.

    A partner is skipped when its folder is not found, when it has no mono
    intrinsics, or when no stereo pairs can be built. The left camera's board
    size, square size and image size are used for every pair.

    Args:
        input_path: Session path; outputs are written beneath it.
        left_camera: Logical name of the left camera.
        mono_results: Mono intrinsics per logical camera name.
        board_sizes: Chessboard grid size per logical camera name.
        square_sizes: Chessboard square size per logical camera name.
        time_window_sec: Time window handed to ``build_stereo_pairs``.
        partners: Logical names of the right cameras to try.
        troubleshooting: When true, print skip/failure reasons and also write
            the pairing report and rectified images.

    Raises:
        FileNotFoundError: If the left camera folder is not found.
        RuntimeError: If ``mono_results`` has no entry for the left camera.
    """
    root = resolve_session_root(input_path)
    left_cam = discover_camera_folder(root, left_camera)
    if left_cam is None:
        raise FileNotFoundError(f"Left camera folder {left_camera!r} not found")
    if left_camera not in mono_results:
        raise RuntimeError(
            f"No mono intrinsics for {left_camera}. Run mono calibration first."
        )

    left_features = load_folder_features(left_cam.path)
    grid = board_sizes[left_camera]
    cell = square_sizes[left_camera]
    image_size = mono_results[left_camera]["image_size"]

    def debug(message: str) -> None:
        # Detail lines are only shown in troubleshooting mode.
        if troubleshooting:
            print(message)

    for partner in partners:
        pair_tag = f"{left_camera}-{partner}"

        right_cam = discover_camera_folder(root, partner)
        if right_cam is None:
            debug(f"[SKIP stereo:{pair_tag}] folder not found")
            continue
        if partner not in mono_results:
            debug(
                f"[SKIP stereo:{pair_tag}] "
                f"no mono intrinsics for {partner}"
            )
            continue

        right_features = load_folder_features(right_cam.path)
        matches = build_stereo_pairs(left_features, right_features, time_window_sec)
        if not matches:
            debug(
                f"[SKIP stereo:{pair_tag}] no valid pairs "
                f"(time_window={time_window_sec}s)"
            )
            continue

        # Count how many pairs came from each pairing method (for the debug line).
        by_time = sum(match.method == "time_window" for match in matches)
        by_key = sum(match.method == "pair_key" for match in matches)
        if troubleshooting:
            print(
                f"[stereo:{pair_tag}] {len(matches)} pairs "
                f"(time_window={by_time}, pair_key={by_key})"
            )
            save_pairing_report(input_path, pair_tag, matches)

        try:
            result = calibrate_stereo_pair(
                matches,
                mono_results[left_camera],
                mono_results[partner],
                grid,
                cell,
                image_size,
            )
            save_stereo_calibration(
                input_path, pair_tag, result, troubleshooting=troubleshooting
            )
            print(
                f"[stereo:{pair_tag}] pairs={result['num_pairs']} "
                f"reproj_err={result['MeanError']:.4f}"
            )
            if troubleshooting:
                save_rectified_pairs(
                    input_path,
                    pair_tag,
                    matches,
                    result,
                    left_cam.folder_name,
                    right_cam.folder_name,
                )
        except RuntimeError as exc:
            # A failing pair is reported and the remaining partners still run.
            print(
                f"[FAIL stereo:{pair_tag}] {exc}"
                if troubleshooting
                else f"[stereo:{pair_tag}] failed"
            )
@@ -0,0 +1,50 @@
 import numpy as np
 import matplotlib as mpl
 import matplotlib.pyplot as plt
 from matplotlib.patches import Patch
 from mpl_toolkits.mplot3d.art3d import Poly3DCollection
 class CameraPoseVisualizer:
    """Matplotlib 3D figure that draws camera poses as translucent pyramids."""

    def __init__(self, xlim, ylim, zlim):
        """Create the figure and a 3D axes with the given limits and x/y/z labels."""
        self.fig = plt.figure(figsize=(18, 7))
        self.ax = self.fig.add_subplot(projection='3d')
        self.ax.set_aspect("auto")
        for apply_limit, limit in (
            (self.ax.set_xlim, xlim),
            (self.ax.set_ylim, ylim),
            (self.ax.set_zlim, zlim),
        ):
            apply_limit(limit)
        for apply_label, text in (
            (self.ax.set_xlabel, 'x'),
            (self.ax.set_ylabel, 'y'),
            (self.ax.set_zlabel, 'z'),
        ):
            apply_label(text)
        print('initialize camera pose visualizer')

    def extrinsic2pyramid(self, extrinsic, color='r', focal_len_scaled=5, aspect_ratio=0.3):
        """Add one camera pyramid, transformed by the 4x4 ``extrinsic`` matrix.

        The pyramid has its apex at the origin and a square base at distance
        ``focal_len_scaled`` with half-width ``focal_len_scaled * aspect_ratio``.
        """
        half = focal_len_scaled * aspect_ratio
        depth = focal_len_scaled
        # Homogeneous vertices: apex first, then the four base corners.
        canonical = np.array([[0, 0, 0, 1],
                              [half, -half, depth, 1],
                              [half, half, depth, 1],
                              [-half, half, depth, 1],
                              [-half, -half, depth, 1]])
        moved = canonical @ extrinsic.T
        # Drop the homogeneous coordinate of every vertex.
        v = [moved[i, :-1] for i in range(5)]
        # Four triangular sides followed by the quadrilateral base.
        faces = [[v[0], v[1], v[2]],
                 [v[0], v[2], v[3]],
                 [v[0], v[3], v[4]],
                 [v[0], v[4], v[1]],
                 [v[1], v[2], v[3], v[4]]]
        pyramid = Poly3DCollection(
            faces, facecolors=color, linewidths=0.3, edgecolors=color, alpha=0.35)
        self.ax.add_collection3d(pyramid)

    def customize_legend(self, list_label):
        """Add a legend with one rainbow-coloured patch per label."""
        total = len(list_label)
        handles = [
            Patch(color=plt.cm.rainbow(position / total), label=text)
            for position, text in enumerate(list_label)
        ]
        plt.legend(loc='right', bbox_to_anchor=(1.8, 0.5), handles=handles)

    def colorbar(self, max_frame_length):
        """Add a vertical rainbow colorbar spanning 0..``max_frame_length``."""
        scale = mpl.colors.Normalize(vmin=0, vmax=max_frame_length)
        mappable = mpl.cm.ScalarMappable(norm=scale, cmap=mpl.cm.rainbow)
        self.fig.colorbar(mappable, orientation='vertical', label='Frame Number')

    def show(self):
        """Set the plot title and display the figure."""
        plt.title('Extrinsic Parameters')
        plt.show()
@@ -0,0 +1,93 @@
 """Shared CLI helpers for calibration scripts."""
 from __future__ import annotations
 import argparse
 from pathlib import Path
 from typing import Dict, Optional, Tuple
 import config
 def parse_chessboard_size(value: str) -> Tuple[int, int]:
    """Convert a ``"width,height"`` string into a tuple of two ints.

    Intended as an ``argparse`` ``type=`` callable.

    Raises
    ------
    argparse.ArgumentTypeError
        If the string does not contain exactly two comma-separated fields.
    ValueError
        If a field is not an integer (raised by ``int``).
    """
    fields = value.split(",")
    if len(fields) == 2:
        return tuple(int(field) for field in fields)
    raise argparse.ArgumentTypeError(
        "chessboard size must be width,height (e.g. 8,7)"
    )
 def add_session_args(parser: argparse.ArgumentParser) -> None:
    """Register the options that identify a calibration session.

    Adds the required ``--project`` and ``--date`` options and the optional
    ``--calib_name`` (default ``calib1``).
    """
    required_options = (
        ("--project", "Project name (e.g. Olsen_wings)"),
        ("--date", "Date string (e.g. 2026-05-12)"),
    )
    for flag, description in required_options:
        parser.add_argument(flag, required=True, help=description)
    parser.add_argument(
        "--calib_name",
        default="calib1",
        help="Calibration folder name (default: calib1)",
    )
 def add_board_args(parser: argparse.ArgumentParser) -> None:
    """Register chessboard geometry and preprocessing options.

    ``--chessboard_size`` and ``--square_size`` give the defaults; the
    ``--left_*`` / ``--right_*`` variants default to ``None`` so that callers
    can tell "not given" apart from an explicit value. ``--preprocessing``
    defaults to the literal string ``"None"``.
    """
    add = parser.add_argument
    add(
        "--chessboard_size",
        type=parse_chessboard_size,
        # A string default is run through ``type`` by argparse, giving (8, 7).
        default="8,7",
        help="Default inner corner grid width,height",
    )
    add(
        "--square_size",
        type=float,
        default=0.045,
        help="Default chessboard square size in metres",
    )
    # Per-side overrides, registered in the same order as before so that the
    # generated --help output is unchanged.
    add("--left_chessboard_size", type=parse_chessboard_size, default=None)
    add("--right_chessboard_size", type=parse_chessboard_size, default=None)
    add("--left_square_size", type=float, default=None)
    add("--right_square_size", type=float, default=None)
    add(
        "--preprocessing",
        type=str,
        default="None",
        help="Pre-detection chain: G=gray, C=CLAHE, T=threshold (e.g. C, GC)",
    )
 def add_troubleshooting_arg(parser: argparse.ArgumentParser) -> None:
    """Register the boolean ``--troubleshooting`` switch (off unless given)."""
    description = (
        "Verbose logs and intermediate debug files (corners/, pairing_reports/, "
        "rectified/). Default: minimal logs; step 2 writes only params/"
    )
    parser.add_argument("--troubleshooting", action="store_true", help=description)
 def resolve_input_path(args) -> Path:
    """Build ``<CALIB_DATA_DIR>/<project>/<date>/<calib_name>`` from parsed args.

    ``args`` must expose ``project``, ``date`` and ``calib_name``; the base
    directory comes from ``config.CALIB_DATA_DIR``.
    """
    project_dir = config.CALIB_DATA_DIR / args.project
    dated_dir = project_dir / args.date
    return dated_dir / args.calib_name
 def build_board_config(args) -> Tuple[Dict[str, Tuple[int, int]], Dict[str, float]]:
    """Resolve per-camera board and square sizes from parsed CLI arguments.

    The left-side values apply to ``lc`` and ``lc-ir``; the right-side values
    apply to ``rc``, ``rg`` and ``ir``. A side-specific option wins over the
    default: board sizes fall back when the override is falsy, square sizes
    only when the override is ``None``.

    Returns
    -------
    (board_sizes, square_sizes)
        Two dicts keyed by logical camera name, in the order
        ``lc, lc-ir, rc, rg, ir``.
    """
    left_cameras = ("lc", "lc-ir")
    right_cameras = ("rc", "rg", "ir")

    def square_or_default(override):
        return args.square_size if override is None else override

    left_board = args.left_chessboard_size or args.chessboard_size
    right_board = args.right_chessboard_size or args.chessboard_size
    left_square = square_or_default(args.left_square_size)
    right_square = square_or_default(args.right_square_size)

    board_sizes = {name: left_board for name in left_cameras}
    board_sizes.update((name, right_board) for name in right_cameras)
    square_sizes = {name: left_square for name in left_cameras}
    square_sizes.update((name, right_square) for name in right_cameras)
    return board_sizes, square_sizes
@@ -0,0 +1,280 @@
 """Step 1: detect chessboard corners / ellipse centers and write per-image JSON."""
 from __future__ import annotations
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Optional, Tuple
 import cv2
 import numpy as np
 from tqdm import tqdm
 from calibrationclasses.feature_json import FeatureRecord, save_feature_json
 from calibrationclasses.preprocessing import Preprocessing
 from calibrationclasses.session import (
    CameraFolder,
    json_path_for_image,
    list_cameras_present,
    list_image_paths,
    resolve_session_root,
 )
 from calibrationclasses.timestamp import parse_pair_key, parse_timestamp_sec
@dataclass
 class DetectionConfig:
    """Settings shared by every image processed in one detection run."""
    # Fallback inner-corner grid (width, height) when no per-camera size is given.
    chessboard_size: Tuple[int, int] = (8, 7)
    # Fallback chessboard square size when no per-camera size is given.
    square_size: float = 0.045
    # Preprocessing chain spec; "none"/"off"/"false"/"0"/"" (case-insensitive) disable it.
    preprocessing: str = "None"
    ir_mode: str = "auto"  # auto | chessboard | ellipse
    # When True: progress bars, failure logs and corner-overlay images are produced.
    troubleshooting: bool = False
 class FeatureDetector:
    """Detect calibration features in images and write one JSON record per image.

    Chessboard corners are the primary feature. For the ``ir`` logical camera
    an ellipse fit can be used instead of, or as a fallback after, the
    chessboard search, depending on ``config.ir_mode``.
    """
    def __init__(self, config: DetectionConfig, corners_root: Optional[Path] = None):
        """Keep the run settings and the optional root folder for overlay images."""
        self.config = config
        self._preprocessor = Preprocessing()
        self.corners_root = corners_root
    def _preprocessing_enabled(self) -> bool:
        """Return True unless the spec is empty or one of none/off/false/0."""
        spec = (self.config.preprocessing or "").strip().lower()
        return bool(spec) and spec not in ("none", "off", "false", "0")
    def _preprocess(self, image: np.ndarray) -> np.ndarray:
        """Apply the configured preprocessing chain, one step per spec character.

        ``g`` = grayscale, ``c`` = CLAHE, ``t`` = threshold. Every step result is
        converted back to 3-channel BGR so the next step (and the detector)
        always receives the same layout. Characters other than g/c/t are
        silently ignored. The input is returned untouched when it is ``None``
        or preprocessing is disabled.
        """
        if image is None or not self._preprocessing_enabled():
            return image
        # Normalise the spec: drop the word "none", commas and spaces.
        spec = (
            (self.config.preprocessing or "")
            .strip()
            .lower()
            .replace("none", "")
            .replace(",", "")
            .replace(" ", "")
        )
        out = image
        pp = self._preprocessor
        for ch in spec:
            if ch == "g":
                g = pp.gray(out)
                out = cv2.cvtColor(g, cv2.COLOR_GRAY2BGR)
            elif ch == "c":
                c = pp.clahe(out)
                out = cv2.cvtColor(c, cv2.COLOR_GRAY2BGR)
            elif ch == "t":
                t = pp.threshold(out)
                out = cv2.cvtColor(t, cv2.COLOR_GRAY2BGR)
        return out
    @staticmethod
    def _to_gray(image: np.ndarray) -> np.ndarray:
        """Return ``image`` unchanged if it is 2-D, otherwise convert BGR to gray."""
        if len(image.shape) == 2:
            return image
        return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    def detect_chessboard(
        self, image: np.ndarray, board_size: Tuple[int, int]
    ) -> Optional[np.ndarray]:
        """Find chessboard corners and refine them to sub-pixel accuracy.

        Returns the refined corner array, or ``None`` when
        ``cv2.findChessboardCorners`` reports no board.
        """
        # Stop sub-pixel refinement after 30 iterations or a 0.001 epsilon.
        criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 30, 0.001)
        found, corners = cv2.findChessboardCorners(image, board_size, None)
        if not found:
            return None
        corners = cv2.cornerSubPix(
            self._to_gray(image), corners, (11, 11), (-1, -1), criteria
        )
        return corners
    def detect_ellipse(self, image: np.ndarray):
        """Fit an ellipse to the largest roughly-square blob in a BGR image.

        Returns ``((cx, cy), info)`` where ``info`` holds ``center``, ``axes``
        and ``angle`` as plain floats/lists, or ``None`` when no suitable
        contour is found.
        """
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)
        blurred = cv2.GaussianBlur(enhanced, (5, 5), 0)
        _, binary = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        # Keep white as the minority class: invert when more than half is white.
        if np.sum(binary == 255) / binary.size > 0.5:
            binary = cv2.bitwise_not(binary)
        kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (9, 9))
        closed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
        contours, _ = cv2.findContours(closed, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        if not contours:
            return None
        valid = []
        for cnt in contours:
            area = cv2.contourArea(cnt)
            # Ignore small blobs.
            if area < 100:
                continue
            x, y, w, h = cv2.boundingRect(cnt)
            # Keep only contours whose bounding box is close to square.
            if 0.75 < (w / h) < 1.25:
                valid.append(cnt)
        if not valid:
            return None
        best = max(valid, key=cv2.contourArea)
        # Guard before fitEllipse: contours with fewer than 5 points are rejected.
        if len(best) < 5:
            return None
        ellipse = cv2.fitEllipse(best)
        (cx, cy), (major, minor), angle = ellipse
        return (cx, cy), {
            "center": [float(cx), float(cy)],
            "axes": [float(major), float(minor)],
            "angle": float(angle),
        }
    def _save_corner_overlay(
        self,
        image: np.ndarray,
        record: FeatureRecord,
        board_size: Tuple[int, int],
    ) -> None:
        """Write a debug copy of ``image`` with the detected feature drawn on it.

        The file goes to ``<corners_root>/<camera_folder>/<image name>``.
        Nothing is written when ``corners_root`` is ``None``.
        """
        if self.corners_root is None:
            return
        out_dir = self.corners_root / record.camera_folder
        out_dir.mkdir(parents=True, exist_ok=True)
        # Draw on a copy so the caller's image is left untouched.
        vis = image.copy()
        if record.feature_type == "chessboard" and record.corners is not None:
            vis = cv2.drawChessboardCorners(vis, board_size, record.corners, True)
        elif record.feature_type == "ellipse" and record.center is not None:
            cx, cy = record.center
            cv2.circle(vis, (int(cx), int(cy)), 12, (0, 255, 0), 2)
        out_path = out_dir / record.image_path.name
        cv2.imwrite(str(out_path), vis)
    def process_image(
        self,
        image_path: Path,
        camera: CameraFolder,
        board_size: Optional[Tuple[int, int]] = None,
        square_size: Optional[float] = None,
    ) -> FeatureRecord:
        """Detect the feature in one image, save its JSON and return the record.

        A JSON file is written in every case (success, load failure or no
        detection). ``board_size`` / ``square_size`` fall back to the values in
        ``self.config`` when not given.
        """
        board_size = board_size or self.config.chessboard_size
        square_size = square_size if square_size is not None else self.config.square_size
        json_path = json_path_for_image(image_path)
        # Failure template; also the source of timestamp/pair key for successes.
        base = FeatureRecord(
            image_path=image_path,
            json_path=json_path,
            camera_folder=camera.folder_name,
            feature_type="unknown",
            success=False,
            preprocessing=self.config.preprocessing,
            timestamp_sec=parse_timestamp_sec(image_path.name),
            pair_key=parse_pair_key(image_path.name),
        )
        image = cv2.imread(str(image_path))
        if image is None:
            base.error = "failed to load image"
            save_feature_json(base)
            return base
        proc = self._preprocess(image)
        # Ellipse detection is only considered for the "ir" logical camera.
        use_ellipse = camera.logical_name == "ir" and self.config.ir_mode in (
            "ellipse",
            "auto",
        )
        # Chessboard is tried for every non-ellipse case, and first in "auto" mode.
        if not use_ellipse or self.config.ir_mode == "auto":
            corners = self.detect_chessboard(proc, board_size)
            if corners is not None:
                record = FeatureRecord(
                    image_path=image_path,
                    json_path=json_path,
                    camera_folder=camera.folder_name,
                    feature_type="chessboard",
                    success=True,
                    board_size=board_size,
                    square_size=square_size,
                    corners=corners,
                    preprocessing=self.config.preprocessing,
                    timestamp_sec=base.timestamp_sec,
                    pair_key=base.pair_key,
                )
                save_feature_json(record)
                if self.config.troubleshooting:
                    self._save_corner_overlay(image, record, board_size)
                return record
        if use_ellipse:
            # Note: the ellipse search runs on the raw image, not the preprocessed one.
            result = self.detect_ellipse(image)
            if result is not None:
                (cx, cy), ellipse = result
                record = FeatureRecord(
                    image_path=image_path,
                    json_path=json_path,
                    camera_folder=camera.folder_name,
                    feature_type="ellipse",
                    success=True,
                    center=(cx, cy),
                    ellipse=ellipse,
                    preprocessing=self.config.preprocessing,
                    timestamp_sec=base.timestamp_sec,
                    pair_key=base.pair_key,
                )
                save_feature_json(record)
                if self.config.troubleshooting:
                    self._save_corner_overlay(image, record, board_size)
                return record
        # Nothing detected: record which feature type was expected.
        base.feature_type = "chessboard" if not use_ellipse else "ellipse"
        base.error = "no features detected"
        save_feature_json(base)
        if self.config.troubleshooting:
            print(f"[detect] FAIL {image_path.name}: {base.error}")
        return base
    def process_camera(
        self,
        camera: CameraFolder,
        board_size: Optional[Tuple[int, int]] = None,
        square_size: Optional[float] = None,
    ) -> Tuple[int, int]:
        """Run detection on every image of one camera folder.

        Returns ``(detected, total)``; ``(0, 0)`` when the folder has no images.
        A one-line summary is always printed; a progress bar is shown only in
        troubleshooting mode.
        """
        images = list_image_paths(camera.path)
        if not images:
            print(f"[WARN] No images in {camera.path}")
            return 0, 0
        detected = 0
        iterator = (
            tqdm(images, unit="img", dynamic_ncols=True)
            if self.config.troubleshooting
            else images
        )
        for image_path in iterator:
            record = self.process_image(
                image_path, camera, board_size=board_size, square_size=square_size
            )
            if record.success:
                detected += 1
            # Only the tqdm wrapper has set_description; a plain list does not.
            if self.config.troubleshooting and hasattr(iterator, "set_description"):
                iterator.set_description(
                    f"{camera.logical_name} | detected {detected}/{len(images)}"
                )
        print(f"[{camera.logical_name}] {detected}/{len(images)} features detected")
        return detected, len(images)
 def run_detection(
    input_path: str | Path,
    config: DetectionConfig,
    cameras: Optional[list[str]] = None,
    per_camera_board: Optional[dict] = None,
 ) -> None:
    """Run feature detection for every selected camera folder of a session.

    Parameters
    ----------
    input_path
        Session folder; camera folders are looked up under the root returned
        by ``resolve_session_root``.
    config
        Detection settings shared by all cameras.
    cameras
        Optional list of logical camera names to keep; falsy means all present.
    per_camera_board
        Optional mapping ``logical name -> {"board_size": ..., "square_size": ...}``.

    Raises
    ------
    FileNotFoundError
        If no (selected) camera folder exists under the session root.
    """
    session_root = resolve_session_root(input_path)
    selected = list_cameras_present(session_root)
    if cameras:
        requested = set(cameras)
        selected = [cam for cam in selected if cam.logical_name in requested]
    if not selected:
        raise FileNotFoundError(f"No camera folders found under {session_root}")

    if config.troubleshooting:
        # Overlays go next to the session input, not under the resolved root.
        overlay_root = Path(input_path) / "corners"
        print(f"[detect] troubleshooting: corner overlays → {overlay_root}")
    else:
        overlay_root = None

    detector = FeatureDetector(config, corners_root=overlay_root)
    overrides = per_camera_board or {}
    for cam in selected:
        settings = overrides.get(cam.logical_name, {})
        detector.process_camera(
            cam,
            board_size=settings.get("board_size"),
            square_size=settings.get("square_size"),
        )
@@ -0,0 +1,136 @@
 """JSON schema for per-image feature detection results."""
 from __future__ import annotations
 import json
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
 import numpy as np
 # Schema version written to the "version" field of every feature JSON file.
 FEATURE_JSON_VERSION = 1
@dataclass
class FeatureRecord:
    """Result of feature detection for a single calibration image.

    Only the first five fields are mandatory; everything else depends on the
    detected feature type (chessboard corners vs. ellipse centre) or is
    metadata parsed from the file name.
    """

    image_path: Path
    json_path: Path
    camera_folder: str
    feature_type: str
    success: bool
    board_size: Optional[Tuple[int, int]] = None
    square_size: Optional[float] = None
    corners: Optional[np.ndarray] = None  # Nx1x2 float32
    center: Optional[Tuple[float, float]] = None
    ellipse: Optional[Dict[str, Any]] = None
    timestamp_sec: Optional[float] = None
    pair_key: Optional[str] = None
    preprocessing: Optional[str] = None
    error: Optional[str] = None

    @property
    def is_chessboard(self) -> bool:
        """True for a successful chessboard detection that carries corners."""
        has_corners = self.corners is not None
        return self.success and self.feature_type == "chessboard" and has_corners

    @property
    def corner_count(self) -> int:
        """Number of stored corners (first array dimension), 0 when absent."""
        return 0 if self.corners is None else int(self.corners.shape[0])
 def corners_to_list(corners: np.ndarray) -> List[List[float]]:
    """Flatten a corner array into a JSON-friendly list of ``[x, y]`` floats."""
    points = corners.reshape(-1, 2)
    return points.astype(float).tolist()
 def corners_from_list(data: List[List[float]]) -> np.ndarray:
    """Rebuild an ``(N, 1, 2)`` float32 corner array from ``[x, y]`` pairs."""
    return np.array(data, dtype=np.float32).reshape((-1, 1, 2))
 def save_feature_json(record: FeatureRecord) -> None:
    """Serialise ``record`` to ``record.json_path`` as indented UTF-8 JSON.

    The always-present keys are written first; optional keys are added only
    when the corresponding field is set (``error`` only when it is truthy).
    Missing parent directories are created.
    """
    payload: Dict[str, Any] = dict(
        version=FEATURE_JSON_VERSION,
        image=record.image_path.name,
        camera_folder=record.camera_folder,
        feature_type=record.feature_type,
        success=record.success,
        preprocessing=record.preprocessing,
        timestamp_sec=record.timestamp_sec,
        pair_key=record.pair_key,
    )
    # Optional fields, kept in a fixed order so the files stay diff-friendly.
    if record.board_size is not None:
        width, height = record.board_size[0], record.board_size[1]
        payload["board_size"] = [int(width), int(height)]
    if record.square_size is not None:
        payload["square_size"] = float(record.square_size)
    if record.corners is not None:
        payload["corners"] = corners_to_list(record.corners)
    if record.center is not None:
        payload["center"] = [float(record.center[0]), float(record.center[1])]
    if record.ellipse is not None:
        payload["ellipse"] = record.ellipse
    if record.error:
        payload["error"] = record.error

    target = record.json_path
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, "w", encoding="utf-8") as handle:
        json.dump(payload, handle, indent=2)
 def load_feature_json(json_path: Path, image_path: Optional[Path] = None) -> FeatureRecord:
    """Read a feature JSON file back into a ``FeatureRecord``.

    When ``image_path`` is not given, the image is looked up next to the JSON:
    first under the name stored in the ``"image"`` key (or the JSON stem if
    the key is missing), then under ``<stem>.bmp/.png/.jpg/.jpeg``. If none of
    those exist, the first candidate is kept as-is.
    """
    with open(json_path, "r", encoding="utf-8") as handle:
        data = json.load(handle)

    if image_path is None:
        base_name = json_path.stem
        folder = json_path.parent
        img = folder / data.get("image", base_name)
        if not img.exists():
            siblings = (
                folder / f"{base_name}{ext}"
                for ext in (".bmp", ".png", ".jpg", ".jpeg")
            )
            # Keep the recorded name when no sibling image exists either.
            img = next((path for path in siblings if path.exists()), img)
    else:
        img = Path(image_path)

    raw_board = data.get("board_size")
    board_size = (int(raw_board[0]), int(raw_board[1])) if raw_board else None

    raw_corners = data.get("corners")
    corners = corners_from_list(raw_corners) if raw_corners else None

    raw_center = data.get("center")
    center = (float(raw_center[0]), float(raw_center[1])) if raw_center else None

    return FeatureRecord(
        image_path=Path(img),
        json_path=Path(json_path),
        camera_folder=data.get("camera_folder", ""),
        feature_type=data.get("feature_type", "unknown"),
        success=bool(data.get("success", False)),
        board_size=board_size,
        square_size=data.get("square_size"),
        corners=corners,
        center=center,
        ellipse=data.get("ellipse"),
        timestamp_sec=data.get("timestamp_sec"),
        pair_key=data.get("pair_key"),
        preprocessing=data.get("preprocessing"),
        error=data.get("error"),
    )
 def load_folder_features(camera_dir: Path) -> List[FeatureRecord]:
    """Load every ``*.json`` in ``camera_dir`` (sorted by path).

    Files that cannot be read or parsed are reported with a warning and
    skipped instead of aborting the whole load.
    """
    loaded = []
    for candidate in sorted(camera_dir.glob("*.json")):
        try:
            record = load_feature_json(candidate)
        except (json.JSONDecodeError, OSError) as exc:
            print(f"[WARN] Skipping invalid JSON {candidate}: {exc}")
        else:
            loaded.append(record)
    return loaded
@@ -0,0 +1,106 @@
 """Stereo pair building: time-window matching with filename-key fallback."""
 from __future__ import annotations
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Tuple
 from calibrationclasses.feature_json import FeatureRecord
@dataclass(frozen=True)
 class StereoPair:
    """One matched left/right detection pair (immutable)."""
    left: FeatureRecord
    right: FeatureRecord
    # Absolute timestamp difference for time-window pairs; 0.0 for pair_key pairs.
    delta_sec: float
    method: str  # "time_window" | "pair_key"
 def _chessboard_compatible(left: FeatureRecord, right: FeatureRecord) -> bool:
    if not left.is_chessboard or not right.is_chessboard:
        return False
    return left.corner_count == right.corner_count
 def pair_by_time_window(
    left_records: List[FeatureRecord],
    right_records: List[FeatureRecord],
    window_sec: float,
 ) -> List[StereoPair]:
    """Match each left image to the closest unused right image within window_sec.

    Only successful chessboard records with a timestamp take part. Left
    records are visited in ascending timestamp order and each one greedily
    takes the nearest compatible right record that is still free and at most
    ``window_sec`` away; each right record is used at most once.

    Parameters
    ----------
    left_records, right_records
        Detection records of the two cameras.
    window_sec
        Maximum allowed absolute timestamp difference, in seconds.

    Returns
    -------
    List[StereoPair]
        Pairs tagged with method ``"time_window"`` and their time difference.
    """
    pairs: List[StereoPair] = []
    used_right: set[int] = set()
    left_sorted = sorted(
        [r for r in left_records if r.is_chessboard and r.timestamp_sec is not None],
        key=lambda r: r.timestamp_sec,
    )
    # Each candidate keeps its index in ``right_records`` as a stable id for
    # the "already used" bookkeeping.
    right_candidates = [
        (i, r)
        for i, r in enumerate(right_records)
        if r.is_chessboard and r.timestamp_sec is not None
    ]
    for left in left_sorted:
        best_idx = None
        best_dt = None
        best_right = None
        for idx, right in right_candidates:
            if idx in used_right:
                continue
            if not _chessboard_compatible(left, right):
                continue
            dt = abs(left.timestamp_sec - right.timestamp_sec)
            if dt <= window_sec and (best_dt is None or dt < best_dt):
                best_idx = idx
                best_dt = dt
                best_right = right
        if best_right is not None:
            used_right.add(best_idx)
            # Use the record remembered during the scan. ``best_idx`` is an
            # index into ``right_records``, not into the filtered
            # ``right_candidates`` list, so it must not be used to index that
            # list (doing so picked the wrong record or raised IndexError
            # whenever a right record had been filtered out).
            pairs.append(StereoPair(left, best_right, best_dt, "time_window"))
    return pairs
 def pair_by_key(
    left_records: List[FeatureRecord],
    right_records: List[FeatureRecord],
 ) -> List[StereoPair]:
    """Legacy exact pair_key matching (IR scan ids, shared numeric suffix).

    Right chessboard records are indexed by ``pair_key`` (the last record wins
    when a key repeats). Each left chessboard record is then paired with the
    right record carrying the same key, provided that key has not been used
    yet and both boards have the same corner count. Pairs get
    ``delta_sec == 0.0`` and method ``"pair_key"``.
    """
    right_by_key: Dict[str, FeatureRecord] = {
        candidate.pair_key: candidate
        for candidate in right_records
        if candidate.is_chessboard and candidate.pair_key
    }
    matched: List[StereoPair] = []
    consumed_keys: set[str] = set()
    for left in left_records:
        key = left.pair_key
        if not (left.is_chessboard and key):
            continue
        partner = right_by_key.get(key)
        if partner is None or key in consumed_keys:
            continue
        if _chessboard_compatible(left, partner):
            consumed_keys.add(key)
            matched.append(StereoPair(left, partner, 0.0, "pair_key"))
    return matched
 def build_stereo_pairs(
    left_records: List[FeatureRecord],
    right_records: List[FeatureRecord],
    time_window_sec: float = 0.1,
 ) -> List[StereoPair]:
    """
    Prefer time-window pairs; fill remaining with pair_key matches not already paired.

    Records consumed by a time-window pair (identified by ``image_path``) are
    removed before the key-based pass, so no image ends up in two pairs.
    The result lists the time-window pairs first, then the key pairs.
    """
    by_time = pair_by_time_window(left_records, right_records, time_window_sec)

    taken_left = set()
    taken_right = set()
    for pair in by_time:
        taken_left.add(pair.left.image_path)
        taken_right.add(pair.right.image_path)

    leftover_left = [rec for rec in left_records if rec.image_path not in taken_left]
    leftover_right = [rec for rec in right_records if rec.image_path not in taken_right]
    return by_time + pair_by_key(leftover_left, leftover_right)
@@ -0,0 +1,82 @@
 from typing import List, Tuple
 import cv2
 import numpy as np
 class Preprocessing:
    """Image preprocessing steps applied before chessboard detection.

    Parameters
    ----------
    tileGridSize: Tuple[int, int]
        CLAHE tile grid size, default = (15, 15)
    clipLimit: float
        CLAHE clip limit, default = 5.0
    thresh1: int
        threshold value passed to ``cv2.threshold``, default = 0
    thresh2: int
        maximum value passed to ``cv2.threshold``, default = 255
    """
    def __init__(
        self,
        tileGridSize: Tuple[int, int] = (15, 15),
        clipLimit: float = 5.0,
        thresh1: int = 0,
        thresh2: int = 255,
    ) -> None:
        self.tileGridSize, self.clipLimit = tileGridSize, clipLimit
        self.thresh1, self.thresh2 = thresh1, thresh2
    def gray(self, image: np.ndarray) -> np.ndarray:
        """Convert a BGR image to grayscale.

        Parameters
        ----------
        image : np.ndarray
            BGR image of the chessboard

        Returns
        -------
        np.ndarray
            single-channel grayscale image
        """
        return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    def clahe(self, image: np.ndarray) -> np.ndarray:
        """Convert to grayscale and apply CLAHE.

        Parameters
        ----------
        image : np.ndarray
            BGR image of the chessboard

        Returns
        -------
        np.ndarray
            grayscale image after CLAHE with the configured clip limit and
            tile grid size
        """
        equalizer = cv2.createCLAHE(
            clipLimit=self.clipLimit, tileGridSize=self.tileGridSize
        )
        return equalizer.apply(self.gray(image))
    def threshold(self, image: np.ndarray) -> np.ndarray:
        """Convert to grayscale, apply CLAHE, then threshold with Otsu's method.

        Parameters
        ----------
        image : np.ndarray
            BGR image of the chessboard

        Returns
        -------
        np.ndarray
            thresholded image
        """
        # NOTE(review): in OpenCV's enum THRESH_BINARY is 0 and
        # THRESH_BINARY_INV is 1, so the extra "+ 1" makes this
        # THRESH_BINARY_INV + THRESH_OTSU (inverted output). Confirm that the
        # inversion is intended.
        flags = cv2.THRESH_BINARY + cv2.THRESH_OTSU + 1
        equalized = self.clahe(image)
        _, binary = cv2.threshold(equalized, self.thresh1, self.thresh2, flags)
        return binary
@@ -0,0 +1,72 @@
 """Calibration session path resolution and camera folder discovery."""
 from __future__ import annotations
 import os
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple
 # File suffixes treated as images; matched case-insensitively by list_image_paths.
 IMAGE_EXTENSIONS = (".bmp", ".png", ".jpg", ".jpeg")
 # Logical camera name -> folder aliases on disk
 # Aliases are tried in the listed order; the first existing folder wins.
 CAMERA_FOLDER_ALIASES: Dict[str, Tuple[str, ...]] = {
    "lc": ("lc",),
    "lc-ir": ("lc-ir", "lc_ir", "LC-IR"),
    "rc": ("rc",),
    "rg": ("rg", "rgb"),
    "ir": ("ir", "IR"),
 }
 # Logical cameras that can be the stereo partner; not used within this module
 # (presumably paired against the left camera by the calibration step -- confirm).
 STEREO_PARTNERS = ("rc", "rg", "ir")
@dataclass(frozen=True)
 class CameraFolder:
    """A camera folder found on disk (immutable)."""
    # Key of CAMERA_FOLDER_ALIASES this folder was resolved for.
    logical_name: str
    # Full path of the folder under the session root.
    path: Path
    # Alias that actually exists on disk (last component of ``path``).
    folder_name: str
 def resolve_session_root(input_path: str | Path) -> Path:
    """Return flat or nested `images/` root containing camera folders.

    If ``<input_path>/images`` is a directory it is returned; otherwise the
    input path itself (as a ``Path``) is returned.
    """
    root = Path(input_path)
    nested = root / "images"
    return nested if nested.is_dir() else root
 def discover_camera_folder(
    session_root: Path, logical_name: str
 ) -> Optional[CameraFolder]:
    """Locate the on-disk folder of one logical camera.

    The aliases registered for ``logical_name`` are tried in order and the
    first existing directory is returned as a ``CameraFolder``. Returns
    ``None`` for an unknown logical name or when no alias exists on disk.
    """
    for alias in CAMERA_FOLDER_ALIASES.get(logical_name) or ():
        candidate = session_root / alias
        if candidate.is_dir():
            return CameraFolder(logical_name, candidate, alias)
    return None
 def list_image_paths(camera_dir: Path) -> List[Path]:
    """Return the sorted paths of all image files directly inside ``camera_dir``.

    A file counts as an image when its lower-cased name ends with one of
    ``IMAGE_EXTENSIONS``.
    """
    return sorted(
        camera_dir / entry
        for entry in os.listdir(camera_dir)
        if entry.lower().endswith(IMAGE_EXTENSIONS)
    )
 def json_path_for_image(image_path: Path) -> Path:
    """Return the path of the feature JSON stored next to ``image_path``.

    Only the last suffix is replaced, e.g. ``lc/img.bmp`` -> ``lc/img.json``.
    """
    return image_path.with_suffix(".json")
 def list_cameras_present(session_root: Path) -> List[CameraFolder]:
    """Return a ``CameraFolder`` for every known logical camera found on disk.

    Cameras are reported in the key order of ``CAMERA_FOLDER_ALIASES``.
    """
    lookups = (
        discover_camera_folder(session_root, logical)
        for logical in CAMERA_FOLDER_ALIASES
    )
    return [camera for camera in lookups if camera is not None]
@@ -0,0 +1,80 @@
 """Parse timestamps and pairing keys from calibration image filenames."""
 from __future__ import annotations
 import re
 from pathlib import Path
 from typing import Optional, Tuple
 # "ts" followed by digits anywhere in the name (case-insensitive); group 1 = digits.
 _TS_TOKEN = re.compile(r"ts(\d+)", re.IGNORECASE)
 # "scan" followed by exactly six captured digits anywhere in the name.
 _SCAN_TOKEN = re.compile(r"scan(\d{6})", re.IGNORECASE)
 # Names that start with "ir_scan_<digits>"; group 1 = the scan number.
 _IR_SCAN = re.compile(r"^ir_scan_(\d+)", re.IGNORECASE)
 def _digits_after_prefix(name: str, prefixes: Tuple[str, ...]) -> Optional[str]:
    lower = name.lower()
    for prefix in sorted(prefixes, key=len, reverse=True):
        if lower.startswith(prefix):
            remainder = lower[len(prefix) :].lstrip("_-.")
            m = re.match(r"(\d+)", remainder)
            if m:
                return m.group(1)
    return None
 def parse_timestamp_sec(filename: str) -> Optional[float]:
    """
    Normalize filename timestamps to seconds for time-window pairing.

    Two sources are tried, in this order:

      1. A ``ts<digits>`` token anywhere in the name: 16 or more digits are
         divided by 1e6 (microseconds), anything shorter by 1e3
         (milliseconds), e.g. ``lc_ts1634840093_ck....``.
      2. Digits directly after a camera prefix: 16+ digits are divided by
         1e6 (``lc_1778599872850705.bmp``), 13-15 digits by 1e3
         (``lc_1778599872850.bmp``), shorter runs are returned unscaled.

    Returns ``None`` when neither source yields digits.

    NOTE(review): short ``ts`` tokens are scaled as milliseconds while short
    prefix digits are taken as seconds -- confirm this asymmetry is intended.
    """
    name = Path(filename).name

    token = _TS_TOKEN.search(name)
    if token is not None:
        raw = token.group(1)
        divisor = 1_000_000.0 if len(raw) >= 16 else 1_000.0
        return int(raw) / divisor

    raw = _digits_after_prefix(
        name, ("lc-ir", "lcir", "lc_ir", "lc", "rc", "rg", "rgb", "ir")
    )
    if raw is None:
        return None
    width = len(raw)
    if width >= 16:
        return int(raw) / 1_000_000.0
    if width >= 13:
        return int(raw) / 1_000.0
    return float(raw)
 def parse_pair_key(filename: str) -> Optional[str]:
    """
    Filename key for legacy exact matching (IR scan ids, shared numeric tails).

    The lower-cased file name is checked against these rules, first hit wins:

      1. ``ir_scan_<n>...``      -> ``scan`` + n zero-padded to six digits
      2. ``...scan<6 digits>...`` -> that ``scan######`` token
      3. ``...ts<digits>...``     -> ``ts<digits>``
      4. digits after a camera prefix -> the digit string
      5. otherwise                -> the lower-cased file stem
    """
    lowered = Path(filename).name.lower()

    ir_scan = _IR_SCAN.match(lowered)
    if ir_scan is not None:
        return f"scan{int(ir_scan.group(1)):06d}"

    scan_token = _SCAN_TOKEN.search(lowered)
    if scan_token is not None:
        return scan_token.group(0).lower()

    ts_token = _TS_TOKEN.search(lowered)
    if ts_token is not None:
        return f"ts{ts_token.group(1)}"

    tail_digits = _digits_after_prefix(
        lowered, ("lc-ir", "lcir", "lc_ir", "lc", "rc", "rg", "rgb", "ir")
    )
    return tail_digits if tail_digits else Path(lowered).stem
@@ -0,0 +1,73 @@
 #!/usr/bin/env python3
 """
 Step 1 — Feature detection for calibration.
 Detects chessboard corners (and ellipse centers for IR when needed) and writes
 one JSON per image next to the source file in the same camera folder.
 """
 import sys
 from pathlib import Path
 sys.path.insert(0, str(Path.home() / "Speckle-Scanner"))
 sys.path.insert(0, str(Path(__file__).resolve().parent))
 import argparse
 from calibrationclasses.cli_common import (
    add_board_args,
    add_session_args,
    add_troubleshooting_arg,
    build_board_config,
    resolve_input_path,
 )
 from calibrationclasses.feature_detection import DetectionConfig, run_detection
 def main():
    """Command-line entry point for calibration step 1 (feature detection).

    Parses the session, board, camera-selection, IR-mode and troubleshooting
    options, builds the per-camera board configuration and runs detection on
    the resolved session folder.
    """
    parser = argparse.ArgumentParser(
        description="Calibration step 1: detect features and save per-image JSON"
    )
    add_session_args(parser)
    add_board_args(parser)
    parser.add_argument(
        "--cameras",
        type=str,
        default=None,
        help="Comma-separated camera folders to process (default: all present)",
    )
    parser.add_argument(
        "--ir_mode",
        choices=("auto", "chessboard", "ellipse"),
        default="auto",
        help="IR detection: try chessboard first (auto), or force one mode",
    )
    add_troubleshooting_arg(parser)
    args = parser.parse_args()

    board_sizes, square_sizes = build_board_config(args)
    per_camera_board = {}
    for name, board in board_sizes.items():
        per_camera_board[name] = {
            "board_size": board,
            "square_size": square_sizes[name],
        }

    # Empty entries (e.g. from "lc,,rc") are dropped; no option means "all".
    if args.cameras:
        cameras = [part.strip() for part in args.cameras.split(",") if part.strip()]
    else:
        cameras = None

    detection_config = DetectionConfig(
        chessboard_size=args.chessboard_size,
        square_size=args.square_size,
        preprocessing=args.preprocessing,
        ir_mode=args.ir_mode,
        troubleshooting=args.troubleshooting,
    )
    session_dir = resolve_input_path(args)
    print(f"[detect] session: {session_dir}")
    run_detection(
        session_dir,
        detection_config,
        cameras=cameras,
        per_camera_board=per_camera_board,
    )
    print("[detect] done")
 if __name__ == "__main__":
    main()
@@ -0,0 +1,205 @@
 #!/usr/bin/env python3
 """
 Calibration entry point.
 Default (2-step pipeline):
  1. detect_features.py — corners/ellipses → per-image JSON
  2. calibrate.py       — mono intrinsics + stereo (lc vs rc/rg/ir)
 Legacy one-shot mode: --legacy (detect + calibrate in memory, single partner)
 """
 import sys
 from pathlib import Path
 sys.path.insert(0, str(Path.home() / "Speckle-Scanner"))
 sys.path.insert(0, str(Path(__file__).resolve().parent))
 import argparse
 import threading
 from typing import Optional, Tuple
 import config
 from calibrationclasses.calibration import StereoCalibration
 from calibrationclasses.calibration_engine import (
    run_mono_calibration,
    run_stereo_calibration,
 )
 from calibrationclasses.cli_common import (
    add_board_args,
    add_session_args,
    add_troubleshooting_arg,
    build_board_config,
    parse_chessboard_size,
    resolve_input_path,
 )
 from calibrationclasses.feature_detection import DetectionConfig, run_detection
 from calibrationclasses.session import STEREO_PARTNERS
 def parse_args():
    """Build the calibration command line and return the parsed namespace.

    Session, board and troubleshooting options are added by the shared
    ``cli_common`` helpers; the remaining options select the pipeline step,
    legacy mode, the stereo cameras and the IR detection mode.
    """
    cli = argparse.ArgumentParser(
        description="Stereo camera calibration (2-step pipeline by default)"
    )
    add_session_args(cli)
    add_board_args(cli)

    # Pipeline selection.
    cli.add_argument(
        "--step",
        default="all",
        choices=("detect", "calibrate", "all"),
        help="Pipeline step: detect JSONs, calibrate from JSONs, or both (default)",
    )
    cli.add_argument(
        "--legacy",
        action="store_true",
        help="Old one-shot flow: detect in memory, one stereo partner only",
    )

    # Camera selection.
    cli.add_argument(
        "--left_camera",
        default="lc",
        type=str,
        choices=("lc", "lc-ir", "lc_ir"),
        help="Left camera folder for stereo (default: lc)",
    )
    cli.add_argument(
        "--right_camera",
        default="rc",
        type=str,
        choices=("rc", "rgb", "rg", "ir"),
        help="Stereo partner (legacy mode only; 2-step uses lc vs all partners)",
    )

    # Stereo pairing.
    cli.add_argument(
        "--time_window",
        default=0.1,
        type=float,
        help="Stereo pair time window in seconds (default: 0.1)",
    )
    cli.add_argument(
        "--partners",
        default="rc,rg,ir",
        type=str,
        help="Stereo partners in 2-step mode (default: rc,rg,ir)",
    )

    # Feature detection.
    cli.add_argument(
        "--ir_mode",
        default="auto",
        choices=("auto", "chessboard", "ellipse"),
        help="IR feature detection mode for step 1",
    )
    add_troubleshooting_arg(cli)

    parsed = cli.parse_args()
    return parsed
 def run_legacy(
    input_path,
    chessboard_size=(8, 7),
    square_size=0.045,
    chessboard_size_left: Optional[Tuple[int, int]] = None,
    chessboard_size_right: Optional[Tuple[int, int]] = None,
    square_size_left: Optional[float] = None,
    square_size_right: Optional[float] = None,
    preprocessing="None",
    left_camera="lc",
    right_camera="rc",
    troubleshooting=False,
 ):
    """Run the legacy one-shot stereo calibration for a single camera pair.

    Corner detection for the left and right camera runs concurrently; the
    detections are then paired, calibrated and saved. When ``troubleshooting``
    is true the calibration images are additionally rectified.

    Args:
        input_path: Session folder passed to ``StereoCalibration``.
        chessboard_size: Board size used for a side with no specific size.
        square_size: Square size used for a side with no specific size.
        chessboard_size_left / chessboard_size_right: Optional per-side board
            size; falls back to ``chessboard_size`` when falsy.
        square_size_left / square_size_right: Optional per-side square size;
            falls back to ``square_size`` when ``None``.
        preprocessing: Preprocessing mode forwarded to ``StereoCalibration``.
        left_camera: Left camera name.
        right_camera: Right camera name.
        troubleshooting: Enables the extra rectification output.

    Raises:
        Exception: Whatever either detection worker raised. Previously such an
            exception stayed inside its thread and calibration continued with
            incomplete detections.
    """
    # Local import: only this function needs it.
    from concurrent.futures import ThreadPoolExecutor

    chessboard_size_left = chessboard_size_left or chessboard_size
    chessboard_size_right = chessboard_size_right or chessboard_size
    square_size_left = square_size if square_size_left is None else square_size_left
    square_size_right = (
        square_size if square_size_right is None else square_size_right
    )
    stereo_calibrator = StereoCalibration(
        input_path,
        chessboard_size,
        square_size,
        preprocessing,
        chessboard_size_left=chessboard_size_left,
        chessboard_size_right=chessboard_size_right,
        square_size_left=square_size_left,
        square_size_right=square_size_right,
        left_camera=left_camera,
        right_camera=right_camera,
        troubleshooting=troubleshooting,
    )
    if stereo_calibrator._preprocessing_enabled():
        print(f"[INFO] Preprocessing for corner detection enabled: {preprocessing!r}")

    # Detect both sides in parallel. Leaving the ``with`` block waits for both
    # workers to finish, mirroring the former start()/join() sequence.
    with ThreadPoolExecutor(max_workers=2) as pool:
        detections = [
            pool.submit(stereo_calibrator.create_chessboard_points_left),
            pool.submit(stereo_calibrator.create_chessboard_points_right),
        ]
    # result() re-raises any worker exception in the calling thread so a failed
    # detection aborts the run instead of being silently ignored.
    for detection in detections:
        detection.result()

    stereo_calibrator.build_pairs_cal()
    stereo_calibrator.calibrate()
    stereo_calibrator.save_stereo_calibration()
    if troubleshooting:
        stereo_calibrator.rectify_calibration_images()
 def run_two_step(args):
    """Run the two-step pipeline (detection and/or calibration) per ``args.step``.

    ``detect`` writes per-image feature JSON, ``calibrate`` runs mono
    intrinsics followed by stereo calibration of the left camera against each
    partner, and ``all`` does both in that order.
    """
    session_dir = resolve_input_path(args)
    board_sizes, square_sizes = build_board_config(args)

    board_lookup = {}
    for cam in board_sizes:
        board_lookup[cam] = {
            "board_size": board_sizes[cam],
            "square_size": square_sizes[cam],
        }

    # Accept "lc_ir" as an alias of "lc-ir".
    left_name = args.left_camera.lower().replace("_", "-")
    partner_names = tuple(
        token.strip() for token in args.partners.split(",") if token.strip()
    )

    wants_detection = args.step in ("detect", "all")
    wants_calibration = args.step in ("calibrate", "all")

    if wants_detection:
        print("\n=== Step 1: Feature detection → JSON ===")
        detection_cfg = DetectionConfig(
            chessboard_size=args.chessboard_size,
            square_size=args.square_size,
            preprocessing=args.preprocessing,
            ir_mode=args.ir_mode,
            troubleshooting=args.troubleshooting,
        )
        run_detection(session_dir, detection_cfg, per_camera_board=board_lookup)

    if wants_calibration:
        print("\n=== Step 2a: Mono intrinsics ===")
        intrinsics = run_mono_calibration(
            session_dir,
            board_sizes,
            square_sizes,
            troubleshooting=args.troubleshooting,
        )

        print("\n=== Step 2b: Stereo (lc vs partners) ===")
        # An empty --partners value falls back to the module default.
        run_stereo_calibration(
            session_dir,
            left_camera=left_name,
            mono_results=intrinsics,
            board_sizes=board_sizes,
            square_sizes=square_sizes,
            time_window_sec=args.time_window,
            partners=partner_names or STEREO_PARTNERS,
            troubleshooting=args.troubleshooting,
        )
 if __name__ == "__main__":
    # Script entry: the default is the two-step pipeline; --legacy selects the
    # one-shot flow.
    args = parse_args()
    input_path = str(resolve_input_path(args))
    if not args.legacy:
        run_two_step(args)
    else:
        legacy_options = dict(
            input_path=input_path,
            chessboard_size=args.chessboard_size,
            square_size=args.square_size,
            chessboard_size_left=args.left_chessboard_size,
            chessboard_size_right=args.right_chessboard_size,
            square_size_left=args.left_square_size,
            square_size_right=args.right_square_size,
            preprocessing=args.preprocessing,
            left_camera=args.left_camera,
            right_camera=args.right_camera,
            troubleshooting=args.troubleshooting,
        )
        run_legacy(**legacy_options)
@@ -0,0 +1,8 @@
 # 02_Calibration — Python dependencies
 # Install: pip install -r requirements.txt
 # Full pipeline (all steps): pip install -r ~/Speckle-Scanner/requirements.txt
 numpy>=1.21
 opencv-python>=4.8
 tqdm>=4.0
 matplotlib>=3.5
@@ -0,0 +1,204 @@
 # 04 Rectification
 Stereo rectification for multi-camera scan sessions. Reads raw images from `3D-Scans`, applies calibration from `Calib-data`, and writes results into `Speckle-Scanner_Processing_data`.
 Supported stereo pairs (per scan):
 | Pair | Left | Right | Params file |
 |------|------|-------|-------------|
 | `lc-rc` | `lc_*` | `rc_*` | `lc-rc_parameters.npz` |
 | `lc-rg` | `lc_*` | `rg_*` | `lc-rg_parameters.npz` |
 | `lc-ir` | `lc_*` | `ir_*` | `lc-ir_parameters.npz` |
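 Rectified LC frames are taken from a single pair — `lc-rc` by default (see `--keep_lc_from_pair`) — so `02_rect_images` holds exactly one LC set. If that pair yields no image pairs for a scan, the LC frames come from the first pair that does. Partner cameras (`rc`, `rg`, `ir`) are always saved from their own pair calibration.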
 ---
 ## Folder layout (general paths)
 All paths use `$HOME` — replace with your home directory on any machine.
 | Role | Path pattern |
 |------|----------------|
 | Source scans (RAW) | `$HOME/3D-Scans/<raw_project>/<date>/sessionN/Scan00000X/01_raw_images/` |
 | Calibration params | `$HOME/Calib-data/<project>/<date>/<calib_name>/params/` |
 | Processing output | `$HOME/Speckle-Scanner_Processing_data/<project>/<date>/` |
 Example naming:
 - **project** (Calib + processing): `Olsen_wings` (underscore)
 - **raw_project** (3D-Scans): `Olsen-wings` (often hyphen; default = project with `_` → `-`)
 - **date**: `2026-05-12`
 - **calib_name**: `calib1`
 Per session under processing output:
 ```text
 $HOME/Speckle-Scanner_Processing_data/<project>/<date>/
  session53/
    params_link/          # copied lc-rc, lc-rg, lc-ir params
    Scan000001/
      01_raw_images/      # copy of source images
      02_rect_images/       # rectified lc_*, rc_*, rg_*, ir_* (single folder)
    Scan000002/
      ...
 ```
 Source side (same session/scan names):
 ```text
 $HOME/3D-Scans/<raw_project>/<date>/session53/Scan000001/01_raw_images/
 ```
 Calibration params (once per project/date):
 ```text
 $HOME/Calib-data/<project>/<date>/<calib_name>/params/
  lc-rc_parameters.npz
  lc-rc_stereo_cam_model.yaml
  lc-rc_Q.cvstore
  lc-rg_*
  lc-ir_*
 ```
 ---
 ## Requirements
 ```bash
 pip install numpy opencv-python tqdm
 ```
 Use a Python environment where `import cv2` works.
 ---
 ## How to run
 From anywhere:
 ```bash
 cd "$HOME/Speckle-Scanner/04_Rectification"
 python main.py [options]
 ```
 ### All sessions under one date (full batch)
 Processes every `session*/Scan*/01_raw_images` under the date folder.
 ```bash
 python main.py \
  --project Olsen_wings \
  --raw_project Olsen-wings \
  --date 2026-05-12 \
  --calib_name calib1
 ```
 One line:
 ```bash
 python main.py --project Olsen_wings --raw_project Olsen-wings --date 2026-05-12 --calib_name calib1
 ```
 ### One session only
 ```bash
 python main.py \
  --project Olsen_wings \
  --raw_project Olsen-wings \
  --date 2026-05-12 \
  --calib_name calib1 \
  --session session53
 ```
 ### Custom pairs
 Default: `lc-rc,lc-rg,lc-ir`. Example — RC and IR only:
 ```bash
 python main.py --project Olsen_wings --date 2026-05-12 --pairs lc-rc,lc-ir
 ```
 ### Override paths (any project/machine)
 ```bash
 python main.py \
  --source_date_root "$HOME/3D-Scans/MyProject/2026-05-12" \
  --calib_params_dir "$HOME/Calib-data/MyProject/2026-05-12/calib1/params" \
  --processing_date_root "$HOME/Speckle-Scanner_Processing_data/MyProject/2026-05-12"
 ```
 ---
 ## CLI reference
 | Option | Default | Meaning |
 |--------|---------|---------|
 | `--project` | **required** | Project name in Calib-data and Processing_data (e.g. `Olsen_wings`) |
 | `--date` | **required** | Date subfolder (e.g. `2026-05-12`) |
 | `--raw_project` | `<project>` with `_` → `-` | Project folder name under 3D-Scans |
 | `--session` | (all) | Only this session, e.g. `session53` |
 | `--calib_name` | `calib1` | Calibration run folder |
 | `--pairs` | `lc-rc,lc-rg,lc-ir` | Comma-separated stereo pairs |
 | `--keep_lc_from_pair` | `lc-rc` | Which pair defines rectified LC in `02_rect_images` |
 | `--source_date_root` | auto | Override RAW scan root |
 | `--calib_params_dir` | auto | Override params folder |
 | `--processing_date_root` | auto | Override output root |
 ---
 ## Pairing notes
 Images are matched by filename key. The strategies below are tried in order, and the first one that yields at least one pair is used:
 1. `_ts<number>` in both names (e.g. `lc_ts254303092_...` ↔ `rc_ts254303092_...`)
 2. `scan000001` style / `IR_scan_000001`
 3. Prefix + suffix (`lc_123` ↔ `ir_123`)
 If no key matches for a pair (most likely `lc-rg` or `lc-ir`), the script uses **index fallback**: both image lists are sorted by filename and paired by position (first LC with first RG/IR, and so on). Check logs for:
 ```text
 [WARN] No key match for lc-rg; using index fallback with N pairs.
 ```
 `lc-rc` usually matches on `_ts` when both cameras captured the same timestamps.
 ---
 ## What gets created
 For each processed scan:
 - Copies `01_raw_images` into processing tree (does not delete source RAW data)
 - Writes rectified images to `02_rect_images/`
 - Creates `params_link/` once per session with all calibration files
 Does **not** modify files under `3D-Scans`.
 ---
 ## Quick check after a run
 ```bash
 PROJECT=Olsen_wings
 DATE=2026-05-12
 SESSION=session53
 SCAN=Scan000001
 ls "$HOME/Speckle-Scanner_Processing_data/$PROJECT/$DATE/$SESSION/params_link"
 ls "$HOME/Speckle-Scanner_Processing_data/$PROJECT/$DATE/$SESSION/$SCAN/02_rect_images" | head
 ```
 ---
 ## Dependencies
 ```bash
 # This step only
 pip install -r ~/Speckle-Scanner/04_Rectification/requirements.txt
 # Or install everything for the full pipeline
 pip install -r ~/Speckle-Scanner/requirements.txt
 ```
 Packages: `numpy`, `opencv-python`, `tqdm`.
@@ -0,0 +1,85 @@
 import argparse
 from pathlib import Path
 from rectificationclasses.rectification import Rectification
 def parse_args():
    """Parse the batch-rectification command line.

    ``--project`` and ``--date`` are required; every other option has a
    default. The three ``*_root`` / ``*_dir`` options default to ``None`` and
    are resolved by the caller.
    """
    # (flag, add_argument keyword arguments), in the order shown by --help.
    option_table = (
        (
            "--project",
            dict(
                type=str,
                required=True,
                help="Project name used for Calib-data and processing_data (e.g. Olsen_wings)",
            ),
        ),
        (
            "--raw_project",
            dict(
                type=str,
                default=None,
                help="Project name used in 3D-Scans (default: project with '_' replaced by '-')",
            ),
        ),
        (
            "--date",
            dict(type=str, required=True, help="Date folder (e.g. 2026-05-12)"),
        ),
        (
            "--session",
            dict(
                type=str,
                default=None,
                help="Process only this session folder (e.g. session53). Default: all sessions under the date.",
            ),
        ),
        (
            "--calib_name",
            dict(
                type=str,
                default="calib1",
                help="Calibration folder under Calib-data/<project>/<date>/",
            ),
        ),
        (
            "--pairs",
            dict(
                type=str,
                default="lc-rc,lc-rg,lc-ir",
                help="Comma-separated pair list, e.g. lc-rc,lc-rg,lc-ir",
            ),
        ),
        (
            "--keep_lc_from_pair",
            dict(
                type=str,
                default="lc-rc",
                help="Pair whose rectified LC frames are kept in 02_rect_images.",
            ),
        ),
        (
            "--source_date_root",
            dict(
                type=str,
                default=None,
                help="Override source root (default: ~/3D-Scans/<raw_project>/<date>)",
            ),
        ),
        (
            "--calib_params_dir",
            dict(
                type=str,
                default=None,
                help="Override calib params dir (default: ~/Calib-data/<project>/<date>/<calib_name>/params)",
            ),
        ),
        (
            "--processing_date_root",
            dict(
                type=str,
                default=None,
                help="Override processing target root (default: ~/Speckle-Scanner_Processing_data/<project>/<date>)",
            ),
        ),
    )

    cli = argparse.ArgumentParser(description="Batch stereo rectification")
    for flag, spec in option_table:
        cli.add_argument(flag, **spec)
    return cli.parse_args()
 def main():
    """Resolve the source, calibration and output folders and run the batch.

    Each folder comes from its explicit override option when given; otherwise
    it is derived from the home directory, project, date and calibration name.
    """
    opts = parse_args()
    home_dir = Path.home()

    # 3D-Scans project folder defaults to the project name with '_' -> '-'.
    raw_project = opts.raw_project or opts.project.replace("_", "-")
    pair_names = tuple(token.strip() for token in opts.pairs.split(",") if token.strip())

    if opts.source_date_root:
        source_root = Path(opts.source_date_root)
    else:
        source_root = home_dir / "3D-Scans" / raw_project / opts.date

    if opts.calib_params_dir:
        params_dir = Path(opts.calib_params_dir)
    else:
        params_dir = (
            home_dir / "Calib-data" / opts.project / opts.date / opts.calib_name / "params"
        )

    if opts.processing_date_root:
        processing_root = Path(opts.processing_date_root)
    else:
        processing_root = (
            home_dir / "Speckle-Scanner_Processing_data" / opts.project / opts.date
        )

    batch = Rectification(
        source_date_root=str(source_root),
        calib_params_dir=str(params_dir),
        processing_date_root=str(processing_root),
        pairs=pair_names,
        keep_lc_from_pair=opts.keep_lc_from_pair,
        session_filter=opts.session,
    )
    batch.run_batch()
 if __name__ == "__main__":
    main()
@@ -0,0 +1,333 @@
 from pathlib import Path
 import re
 import shutil
 from typing import Dict, List, Optional, Tuple
 import cv2
 import numpy as np
 from tqdm import tqdm
 VALID_EXTS = {".bmp", ".png", ".jpg", ".jpeg"}
 class Rectification:
    """Batch rectification for one project/date tree.

    Reads source scans from the RAW data tree, copies each scan's raw images
    into the processing tree, and rectifies every configured ``lc-<partner>``
    pair with that pair's own calibration parameters.
    """

    def __init__(
        self,
        source_date_root: str,
        calib_params_dir: str,
        processing_date_root: str,
        pairs: Tuple[str, ...] = ("lc-rc", "lc-rg", "lc-ir"),
        keep_lc_from_pair: str = "lc-rc",
        session_filter: Optional[str] = None,
    ) -> None:
        """Validate the input folders and load calibration for every pair.

        Args:
            source_date_root: Folder containing ``session*/Scan*/01_raw_images``.
            calib_params_dir: Folder containing ``<pair>_parameters.npz`` files.
            processing_date_root: Output root; created if missing.
            pairs: Pair names of the form ``<left>-<right>``.
            keep_lc_from_pair: Pair whose rectified LC frames are written to
                ``02_rect_images``; it is processed first when present.
            session_filter: When set, only the session with this exact folder
                name is processed.

        Raises:
            FileNotFoundError: A required folder or a pair's params file is
                missing.
            KeyError: A params file lacks a required array.
        """
        self.source_date_root = Path(source_date_root)
        self.calib_params_dir = Path(calib_params_dir)
        self.processing_date_root = Path(processing_date_root)
        self.pairs = pairs
        self.keep_lc_from_pair = keep_lc_from_pair
        self.session_filter = session_filter

        if not self.source_date_root.is_dir():
            raise FileNotFoundError(f"Source date root not found: {self.source_date_root}")
        if not self.calib_params_dir.is_dir():
            raise FileNotFoundError(f"Calibration params dir not found: {self.calib_params_dir}")
        self.processing_date_root.mkdir(parents=True, exist_ok=True)

        self._params_by_pair: Dict[str, Dict[str, np.ndarray]] = {}
        # Keyed by (pair, left_w, left_h, right_w, right_h): the right map
        # depends on the right image size, so it must be part of the key.
        self._rect_maps_cache: Dict[
            Tuple[str, int, int, int, int],
            Tuple[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray], np.ndarray],
        ] = {}
        self._load_all_pair_params()

    @staticmethod
    def _extract_ts_key(filename: str) -> Optional[str]:
        """Return the digits following ``_ts`` in the file stem, or ``None``."""
        stem = Path(filename).stem.lower()
        m = re.search(r"_ts(\d+)", stem)
        return m.group(1) if m else None

    @staticmethod
    def _extract_scan_key(filename: str) -> Optional[str]:
        """Return a normalized ``scanNNNNNN`` key from the file stem, or ``None``.

        Recognizes ``scan`` followed by six digits anywhere in the stem,
        ``ir_scan_<digits>`` at the start (zero-padded to six digits), and
        ``ir_<six digits>`` at the start.
        """
        stem = Path(filename).stem.lower()
        m = re.search(r"(scan\d{6})", stem)
        if m:
            return m.group(1)
        m = re.match(r"^ir_scan_(\d+)", stem)
        if m:
            return f"scan{int(m.group(1)):06d}"
        m = re.match(r"^ir_(\d{6})(?:_|$)", stem)
        if m:
            return f"scan{m.group(1)}"
        return None

    @staticmethod
    def _extract_generic_suffix_key(filename: str, prefix: str) -> Optional[str]:
        """Return the stem with ``prefix`` and leading separators removed.

        The comparison is case-insensitive, matching ``_list_images``. Returns
        ``None`` when the stem does not start with ``prefix``; the result may
        be an empty string when nothing follows the prefix.
        """
        stem = Path(filename).stem.lower()
        # The stem is lower-cased, so the prefix must be as well.
        prefix = prefix.lower()
        if not stem.startswith(prefix):
            return None
        return stem[len(prefix):].lstrip("_-.")

    @staticmethod
    def _camera_from_pair(pair_name: str) -> str:
        """Return the part of ``pair_name`` after the first ``-``."""
        return pair_name.split("-", 1)[1]

    def _load_pair_params(self, pair_name: str) -> Dict[str, np.ndarray]:
        """Load ``<pair_name>_parameters.npz`` and check the required arrays.

        Raises:
            FileNotFoundError: The params file does not exist.
            KeyError: One or more required arrays are missing.
        """
        npz_path = self.calib_params_dir / f"{pair_name}_parameters.npz"
        if not npz_path.exists():
            raise FileNotFoundError(f"Missing params file for {pair_name}: {npz_path}")

        # NOTE(review): allow_pickle=True executes pickled objects found in the
        # archive; only load params files from a trusted calibration run.
        # The context manager closes the archive once all arrays are read.
        with np.load(npz_path, allow_pickle=True) as data:
            params = {key: data[key] for key in data.files}

        required = [
            "L_Intrinsic",
            "L_Distortion",
            "R_Intrinsic",
            "R_Distortion",
            "Rotation",
            "Translation",
        ]
        missing = [k for k in required if k not in params]
        if missing:
            raise KeyError(f"{pair_name} params missing keys: {missing}")
        return params

    def _load_all_pair_params(self) -> None:
        """Load calibration parameters for every configured pair."""
        for pair_name in self.pairs:
            self._params_by_pair[pair_name] = self._load_pair_params(pair_name)
        print(f"[INFO] Loaded calibration params for pairs: {', '.join(self.pairs)}")

    def _copy_params_link_for_session(self, session_name: str) -> None:
        """Copy the calibration files into ``<session>/params_link``.

        Only ``.npz``, ``.yaml`` and ``.cvstore`` files are copied.
        """
        target_params = self.processing_date_root / session_name / "params_link"
        target_params.mkdir(parents=True, exist_ok=True)
        for src in self.calib_params_dir.iterdir():
            if src.is_file() and src.suffix.lower() in (".npz", ".yaml", ".cvstore"):
                shutil.copy2(src, target_params / src.name)

    @staticmethod
    def _copy_raw_images(src_raw_dir: Path, dst_raw_dir: Path) -> None:
        """Copy every regular file from ``src_raw_dir`` into ``dst_raw_dir``."""
        dst_raw_dir.mkdir(parents=True, exist_ok=True)
        for src in src_raw_dir.iterdir():
            if src.is_file():
                shutil.copy2(src, dst_raw_dir / src.name)

    @staticmethod
    def _list_images(raw_dir: Path, prefix: str) -> List[Path]:
        """Return the sorted image files in ``raw_dir`` whose name starts with ``prefix``.

        Both the extension and the prefix are matched case-insensitively.
        """
        imgs = [
            p for p in raw_dir.iterdir()
            if p.is_file()
            and p.suffix.lower() in VALID_EXTS
            and p.name.lower().startswith(prefix.lower())
        ]
        imgs.sort()
        return imgs

    @staticmethod
    def _index_by_key(images: List[Path], key_of) -> Dict[str, Path]:
        """Map ``key_of(path.name)`` to path, skipping empty keys (last one wins)."""
        indexed: Dict[str, Path] = {}
        for path in images:
            key = key_of(path.name)
            if key:
                indexed[key] = path
        return indexed

    @staticmethod
    def _match_on_common_keys(
        left_index: Dict[str, Path], right_index: Dict[str, Path]
    ) -> List[Tuple[Path, Path]]:
        """Pair the entries whose key occurs in both indexes, in sorted key order."""
        shared = sorted(set(left_index) & set(right_index))
        return [(left_index[key], right_index[key]) for key in shared]

    def _pair_images(self, left_images: List[Path], right_images: List[Path], right_camera: str) -> List[Tuple[Path, Path]]:
        """Match left and right images of one scan.

        Strategies are tried in order and the first one that produces at least
        one pair is returned: ``_ts<digits>`` key, scan key, prefix-stripped
        suffix key, and finally positional pairing of the two lists (logged
        with a ``[WARN]`` line).
        """
        strategies = (
            (self._extract_ts_key, self._extract_ts_key),
            (self._extract_scan_key, self._extract_scan_key),
            (
                lambda name: self._extract_generic_suffix_key(name, "lc"),
                lambda name: self._extract_generic_suffix_key(name, right_camera),
            ),
        )
        for left_key, right_key in strategies:
            matched = self._match_on_common_keys(
                self._index_by_key(left_images, left_key),
                self._index_by_key(right_images, right_key),
            )
            if matched:
                return matched

        fallback_count = min(len(left_images), len(right_images))
        if fallback_count > 0:
            print(
                f"[WARN] No key match for lc-{right_camera}; "
                f"using index fallback with {fallback_count} pairs."
            )
            return list(zip(left_images[:fallback_count], right_images[:fallback_count]))
        return []

    def _get_rectification_maps(
        self,
        pair_name: str,
        left_size: Tuple[int, int],
        right_size: Tuple[int, int],
    ) -> Tuple[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray], np.ndarray]:
        """Return cached ``(left_maps, right_maps, Q)`` for a pair and image sizes.

        Sizes are ``(width, height)``. The maps are computed on first use with
        ``cv2.stereoRectify`` (using ``left_size`` as the image size) and
        ``cv2.initUndistortRectifyMap``.
        """
        # Include the right size: the right map is built for ``right_size``, so
        # a key based on the left size alone could return a stale right map.
        cache_key = (pair_name, left_size[0], left_size[1], right_size[0], right_size[1])
        if cache_key in self._rect_maps_cache:
            return self._rect_maps_cache[cache_key]

        params = self._params_by_pair[pair_name]
        rect_left, rect_right, proj_left, proj_right, q_mat, _, _ = cv2.stereoRectify(
            params["L_Intrinsic"],
            params["L_Distortion"],
            params["R_Intrinsic"],
            params["R_Distortion"],
            left_size,
            params["Rotation"],
            params["Translation"],
            alpha=1,
            flags=0,
        )
        left_maps = cv2.initUndistortRectifyMap(
            params["L_Intrinsic"],
            params["L_Distortion"],
            rect_left,
            proj_left,
            left_size,
            cv2.CV_32FC1,
        )
        right_maps = cv2.initUndistortRectifyMap(
            params["R_Intrinsic"],
            params["R_Distortion"],
            rect_right,
            proj_right,
            right_size,
            cv2.CV_32FC1,
        )
        self._rect_maps_cache[cache_key] = (left_maps, right_maps, q_mat)
        return left_maps, right_maps, q_mat

    def _rectify_pair_image(
        self,
        pair_name: str,
        left_img: np.ndarray,
        right_img: np.ndarray,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Rectify one left/right image pair and return ``(left_rect, right_rect)``."""
        left_size = (left_img.shape[1], left_img.shape[0])
        right_size = (right_img.shape[1], right_img.shape[0])
        left_maps, right_maps, _ = self._get_rectification_maps(pair_name, left_size, right_size)
        # cv2.remap does not support INTER_AREA (previously passed here), so
        # request bilinear interpolation explicitly.
        left_rect = cv2.remap(left_img, left_maps[0], left_maps[1], cv2.INTER_LINEAR)
        right_rect = cv2.remap(right_img, right_maps[0], right_maps[1], cv2.INTER_LINEAR)
        return left_rect, right_rect

    def _process_scan(self, session_name: str, scan_name: str) -> Dict[str, int]:
        """Copy one scan's raw images and write its rectified images.

        Returns:
            Counters ``pairs_total`` (matched image pairs), ``saved`` (pairs
            whose outputs were written) and ``skipped`` (pairs without images
            or matches, plus image pairs that could not be read or written).
        """
        src_raw_dir = self.source_date_root / session_name / scan_name / "01_raw_images"
        dst_scan_dir = self.processing_date_root / session_name / scan_name
        dst_raw_dir = dst_scan_dir / "01_raw_images"
        dst_rect_dir = dst_scan_dir / "02_rect_images"
        dst_rect_dir.mkdir(parents=True, exist_ok=True)
        self._copy_raw_images(src_raw_dir, dst_raw_dir)

        stats = {"pairs_total": 0, "saved": 0, "skipped": 0}
        lc_written = False

        # Process the LC-defining pair first so its LC frames take precedence.
        ordered_pairs = list(self.pairs)
        if self.keep_lc_from_pair in ordered_pairs:
            ordered_pairs.remove(self.keep_lc_from_pair)
            ordered_pairs.insert(0, self.keep_lc_from_pair)

        # The LC listing is the same for every pair; read it once.
        left_images = self._list_images(dst_raw_dir, "lc")

        for pair_name in ordered_pairs:
            right_camera = self._camera_from_pair(pair_name)
            right_images = self._list_images(dst_raw_dir, right_camera)
            if not left_images or not right_images:
                stats["skipped"] += 1
                print(
                    f"[WARN] {session_name}/{scan_name} {pair_name}: "
                    f"missing images (lc={len(left_images)}, {right_camera}={len(right_images)})."
                )
                continue

            pairs = self._pair_images(left_images, right_images, right_camera)
            if not pairs:
                stats["skipped"] += 1
                print(f"[WARN] {session_name}/{scan_name} {pair_name}: no valid pairs.")
                continue

            # LC frames come from the configured pair, or from the first pair
            # that reaches this point if no LC frame has been written yet.
            save_lc_this_pair = pair_name == self.keep_lc_from_pair or not lc_written
            stats["pairs_total"] += len(pairs)

            for left_path, right_path in tqdm(
                pairs,
                desc=f"{session_name}/{scan_name} {pair_name}",
                unit="pair",
                leave=False,
            ):
                left_img = cv2.imread(str(left_path), cv2.IMREAD_COLOR)
                right_img = cv2.imread(str(right_path), cv2.IMREAD_COLOR)
                if left_img is None or right_img is None:
                    stats["skipped"] += 1
                    continue

                left_rect, right_rect = self._rectify_pair_image(pair_name, left_img, right_img)

                # cv2.imwrite reports failure through its return value; do not
                # count a pair as saved unless every requested file was written.
                wrote_all = True
                if save_lc_this_pair:
                    if cv2.imwrite(str(dst_rect_dir / left_path.name), left_rect):
                        lc_written = True
                    else:
                        wrote_all = False
                if not cv2.imwrite(str(dst_rect_dir / right_path.name), right_rect):
                    wrote_all = False

                if wrote_all:
                    stats["saved"] += 1
                else:
                    stats["skipped"] += 1
                    print(
                        f"[WARN] {session_name}/{scan_name} {pair_name}: "
                        f"failed to write rectified output for "
                        f"{left_path.name} / {right_path.name}."
                    )
        return stats

    def _discover_session_scan_raw_dirs(self) -> List[Tuple[str, str]]:
        """Return sorted ``(session, scan)`` names that contain ``01_raw_images``.

        Session folders start with ``session`` and scan folders with ``scan``
        (case-insensitive); ``session_filter`` restricts the result to one
        session.
        """
        found: List[Tuple[str, str]] = []
        session_dirs = sorted(
            [p for p in self.source_date_root.iterdir() if p.is_dir() and p.name.lower().startswith("session")]
        )
        for session_dir in session_dirs:
            if self.session_filter and session_dir.name != self.session_filter:
                continue
            scan_dirs = sorted(
                [p for p in session_dir.iterdir() if p.is_dir() and p.name.lower().startswith("scan")]
            )
            for scan_dir in scan_dirs:
                raw_dir = scan_dir / "01_raw_images"
                if raw_dir.is_dir():
                    found.append((session_dir.name, scan_dir.name))
        return found

    def run_batch(self) -> Dict[str, int]:
        """Rectify every discovered scan and return the accumulated counters.

        Raises:
            RuntimeError: No scan folder was found under the source root.
        """
        all_scans = self._discover_session_scan_raw_dirs()
        if not all_scans:
            raise RuntimeError(f"No scan folders found under {self.source_date_root}")
        print(f"[INFO] Found {len(all_scans)} scans under {self.source_date_root}")

        totals = {"scans": 0, "pairs_total": 0, "saved": 0, "skipped": 0}
        sessions_seen = set()
        for session_name, scan_name in all_scans:
            # Calibration files are copied once per session.
            if session_name not in sessions_seen:
                self._copy_params_link_for_session(session_name)
                sessions_seen.add(session_name)
            scan_stats = self._process_scan(session_name, scan_name)
            totals["scans"] += 1
            totals["pairs_total"] += scan_stats["pairs_total"]
            totals["saved"] += scan_stats["saved"]
            totals["skipped"] += scan_stats["skipped"]

        print(
            "[INFO] Batch rectification finished: "
            f"scans={totals['scans']} pairs={totals['pairs_total']} "
            f"saved={totals['saved']} skipped={totals['skipped']}"
        )
        return totals
@@ -0,0 +1,7 @@
 # 04_Rectification — Python dependencies
 # Install: pip install -r requirements.txt
 # Full pipeline (all steps): pip install -r ~/Speckle-Scanner/requirements.txt
 numpy>=1.21
 opencv-python>=4.8
 tqdm>=4.0
@@ -0,0 +1,2 @@
 include/libsgm_config.h
 build/
@@ -0,0 +1,66 @@
 # GitLab CI pipeline: builds the library in several option combinations, then
 # runs the test binary under cuda-memcheck.
 image: adaskit/libsgm:0.3-opencv4
 variables:
  GIT_SUBMODULE_STRATEGY: recursive
 stages:
  - build
  - test
 # Shared body for every build job. Each build:* job below merges it in and
 # supplies build_samples / build_shared / build_tests.
 .build_template: &build_definition
  stage: build
  tags:
    - docker
  script:
    - ldconfig
    - cmake . -DBUILD_OPENCV_WRAPPER="ON" -DENABLE_SAMPLES=${build_samples} -DLIBSGM_SHARED=${build_shared} -DENABLE_TESTS=${build_tests}
    - make
 build:samples_on:shared:
  variables:
    build_samples: "ON"
    build_shared: "ON"
    build_tests: "OFF"
  <<: *build_definition
 build:samples_on:static:
  variables:
    build_samples: "ON"
    build_shared: "OFF"
    build_tests: "OFF"
  <<: *build_definition
 build:samples_off:shared:
  variables:
    build_samples: "OFF"
    build_shared: "ON"
    build_tests: "OFF"
  <<: *build_definition
 build:samples_off:static:
  variables:
    build_samples: "OFF"
    build_shared: "OFF"
    build_tests: "OFF"
  <<: *build_definition
 # Builds the tests only and keeps the test binary as an artifact for the
 # test stage.
 build:test:
  variables:
    build_samples: "OFF"
    build_shared: "OFF"
    build_tests: "ON"
  artifacts:
    paths:
      - ./test/sgm-test
    expire_in: 1d
  <<: *build_definition
 # Runs the sgm-test artifact from build:test with full leak checking.
 test:
  stage: test
  tags:
    - nvidia-docker
  script:
    - ldconfig
    - cuda-memcheck --leak-check full ./test/sgm-test
  dependencies:
    - build:test
@@ -0,0 +1,3 @@
 [submodule "test/googletest"]
 	path = test/googletest
 	url = https://github.com/google/googletest.git
@@ -0,0 +1,28 @@
 # Top-level build script for libSGM.
 cmake_minimum_required(VERSION 3.18)
 # Build options; all are OFF unless enabled on the cmake command line.
 option(ENABLE_ZED_DEMO      "Build a Demo using ZED Camera" OFF)
 option(ENABLE_SAMPLES       "Build samples" OFF)
 option(ENABLE_TESTS         "Test library" OFF)
 option(LIBSGM_SHARED        "Build a shared library" OFF)
 option(BUILD_OPENCV_WRAPPER "Make library compatible with cv::Mat and cv::cuda::GpuMat of OpenCV" OFF)
 # Default CUDA architecture list, used only when the caller has not already
 # defined CMAKE_CUDA_ARCHITECTURES.
 if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
    set(CMAKE_CUDA_ARCHITECTURES "52;61;72;75;86")
 endif()
 project(libSGM VERSION 3.1.0)
 # Generate include/libsgm_config.h from its .in template; the output is
 # written into the source tree.
 configure_file(
 ${PROJECT_SOURCE_DIR}/include/libsgm_config.h.in
 ${PROJECT_SOURCE_DIR}/include/libsgm_config.h
 )
 add_subdirectory(src)
 # Samples and tests are added only when their option is enabled.
 if(ENABLE_SAMPLES)
 	add_subdirectory(sample)
 endif()
 if(ENABLE_TESTS)
 	add_subdirectory(test)
 endif()
@@ -0,0 +1,33 @@
 ###############################################################################
 # Find LibSGM
 #
 # This sets the following variables:
 # LIBSGM_FOUND - True if LIBSGM was found.
 # LIBSGM_INCLUDE_DIRS - Directories containing the LIBSGM include files.
 # LIBSGM_LIBRARY - Libraries needed to use LIBSGM.
 # LIBSGM_LIBRARIES - Set to LIBSGM_LIBRARY when LIBSGM was found.
 # Find lib
 # Start from "not found" so a cached value from an earlier run is overwritten.
 set(LIBSGM_FOUND FALSE CACHE BOOL "" FORCE)
 find_library(LIBSGM_LIBRARY
    NAMES sgm libsgm
    PATH_SUFFIXES lib/
 )
 # Find include
 find_path(LIBSGM_INCLUDE_DIRS
    NAMES libsgm.h
    PATH_SUFFIXES include/
 )
 # Standard found/not-found handling based on the two results above.
 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(LibSGM DEFAULT_MSG LIBSGM_LIBRARY LIBSGM_INCLUDE_DIRS)
 message(STATUS "(LIBSGM_FOUND : ${LIBSGM_FOUND} include: ${LIBSGM_INCLUDE_DIRS}, lib: ${LIBSGM_LIBRARY})")
 mark_as_advanced(LIBSGM_FOUND)
 # On success, store the result in the cache and expose LIBSGM_LIBRARIES.
 if(LIBSGM_FOUND)
    set(LIBSGM_FOUND TRUE CACHE BOOL "" FORCE)
    set(LIBSGM_LIBRARIES ${LIBSGM_LIBRARY})
    message(STATUS "LibSGM found ( include: ${LIBSGM_INCLUDE_DIRS}, lib: ${LIBSGM_LIBRARY})")
 endif()
@@ -0,0 +1,202 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/
   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
   1. Definitions.
      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.
      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.
      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.
      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.
      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.
      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.
      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).
      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.
      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."
      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.
   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.
   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.
   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:
      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and
      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and
      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and
      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.
      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.
   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.
   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.
   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.
   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.
   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.
   END OF TERMS AND CONDITIONS
   APPENDIX: How to apply the Apache License to your work.
      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.
   Copyright [yyyy] [name of copyright owner]
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
       http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
@@ -0,0 +1,335 @@
 # installation process for US:
 Install Anaconda and CUDA Toolkit (compute capability >= 3.5)
 Check if cmake is installed.
 ```
 $ cmake --version
 ```
 If the reported version is older than 3.18, build and install a newer CMake from source:
 ```
 $ sudo apt remove cmake #Only if cmake is installed with <3.18 version
 $ wget https://github.com/Kitware/CMake/releases/download/v3.21.5/cmake-3.21.5.tar.gz
 $ tar -xzvf cmake-3.21.5.tar.gz
 $ cd cmake-3.21.5
 $ ./bootstrap
 $ make
 $ sudo make install
 $ cmake --version
 ```
 If `cmake` is now reported as "not found", check whether it was installed under `/usr/local/bin`:
 ```
 $ find /usr/local/bin -name cmake
 ```
 If the path exists, close the terminal, open a new one, and check the version again.
 ```
 $ cmake --version
 ```
 ## Environment:
 Create an environment (named libsgm) in conda
 ```
 $ conda create --name libsgm
 $ conda activate libsgm
 ```
 Installing Fixstars LibSGM:
 ```
 $ git clone https://gitea.subseascanning.com/dejhost/libSGM.git
 $ cd libSGM
 $ git submodule update --init
 $ mkdir build  
 $ cd build
 $ cmake ../
 $ make
 ```
 ## Sample Execution
 ```
 $ pwd
 .../libSGM
 $ cd build
 $ cmake .. -DENABLE_SAMPLES=on
 $ make
 $ cd sample
 ```
 Place the `data` folder at `libSGM/build/sample/data`.
 Now run the command once to confirm the installation and working of LibSGM.
 For a single image pair, use the `stereosgm_new` executable.
 The disparity map is saved in the directory that contains the `stereosgm_new` executable (.../libSGM/build/sample).
 ```
 $ ./stereosgm_new data/lc00012.bmp data/rc00012.bmp
 ```
 To process multiple pairs one after another, use `stereosgm_image`; it saves `disparity.xml` files in the output directory.
 ```
 $ ./stereosgm_image data/lc%05d.bmp data/rc%05d.bmp
 ```
 ---
 ## **Pipeline Usage (Automated Path Resolution)**
 Use `run_sgm_pipeline.py` to run libSGM across the project folder structure automatically.
 It picks the **last rectified image pair** (highest timestamp) from each scan's `02_rect_images/` folder,
 runs `stereosgm_new`, and saves results to `03_sgm_disp_map/`.
 ### **Folder structure assumed**
 ```
 ~/Speckle-Scanner_Processing_data/
 └── <project>/
    └── <date>/
        └── <session>/
            └── <ScanXXXXXX>/
                ├── 02_rect_images/     ← lc_ts<last>.png + rc_ts<same>.png (input)
                ├── 03_sgm_disp_map/    ← disparity.xml + disparity_color.png (created)
                └── 05_sgm_pcl/         ← untouched
 ```
 Pairs are matched on the shared `ts` token (e.g. `ts1634840093`). Both formats work:
 `lc_ts1634840093_ck….png` / `rc_ts1634840093_ck….png` and `lc_ts1634840093.png` / `rc_ts1634840093.png`.
 ### **Commands**
 ```bash
 cd ~/Speckle-Scanner/05_disparity/libsgm
 # Process ALL scans in a session
 python run_sgm_pipeline.py \
  --project Olsen_wings \
  --date    2026-05-12 \
  --session session1
 # Process ALL sessions on a date (omit --session)
 python run_sgm_pipeline.py \
  --project Olsen_wings \
  --date    2026-05-12
 # Process a SINGLE scan
 python run_sgm_pipeline.py \
  --project Olsen_wings \
  --date    2026-05-12 \
  --session session1 \
  --scan    Scan000001
 # Custom SGM parameters
 python run_sgm_pipeline.py \
  --project    Olsen_wings \
  --date       2026-05-12 \
  --session    session1 \
  --disp_size  128 \
  --P1         8 \
  --P2         32 \
  --min_disp   0 \
  --num_paths  8 \
  --census_type 1
 ```
 ### **Pipeline parameters**
 | Parameter       | Default | Description                                                                      |
 |-----------------|---------|----------------------------------------------------------------------------------|
 | `--project`     | —       | Project name (e.g. `Olsen_wings`)                                                |
 | `--date`        | —       | Date string (e.g. `2026-05-12`)                                                  |
 | `--session`     | all     | Session name (e.g. `session1`); omit to process **all sessions** on that date    |
 | `--scan`        | all     | Single scan (e.g. `Scan000001`); omit to process all scans in the session        |
 | `--disp_size`   | `256`   | Maximum disparity value (64, 128, or 256)                                        |
 | `--P1`          | `10`    | SGM penalty for disparity change of ±1                                           |
 | `--P2`          | `120`   | SGM penalty for disparity change > 1                                             |
 | `--uniqueness`  | `0.80`  | Uniqueness ratio threshold                                                        |
 | `--num_paths`   | `8`     | Scanlines for cost aggregation (4 or 8)                                          |
 | `--min_disp`    | `-160`  | Minimum disparity value                                                           |
 | `--LR_max_diff` | `1`     | Maximum allowed left-right disparity difference                                  |
 | `--census_type` | `1`     | Census transform type: 0=CENSUS_9x7, 1=SYMMETRIC_CENSUS_9x7                     |
 ### **What gets saved in `03_sgm_disp_map/`**
 | File | Description |
 |------|-------------|
 | `disparity.xml` | Raw disparity matrix (OpenCV FileStorage format, CV_16S) |
 | `disparity_color.png` | Colorized disparity image (TURBO colormap, 8-bit) |
 ---
 ## **Direct Binary Usage**
 Run `stereosgm_new` manually with explicit paths (must run from the build/sample directory or use full paths):
 ```bash
 cd ~/Speckle-Scanner/05_disparity/libsgm/build/sample
 # Default parameters, save to current directory
 ./stereosgm_new data/lc00012.bmp data/rc00012.bmp
 # Save to a specific output folder, no display window
 ./stereosgm_new \
  /path/to/lc_image.png \
  /path/to/rc_image.png \
  --output_dir=/path/to/03_sgm_disp_map \
  --no_display=1 \
  --disp_size=128 --P1=8 --P2=32
 ```
 ---
 ## **Available Parameters**
 | Parameter            | Default Value  | Description                                                                |
 | -------------------- | -------------- | -------------------------------------------------------------------------- |
 | `@left-image-format` | `none`         | Format string for the path to input left image (e.g., "left/img_%04d.png") |
 | `@right-image-format`| `none`         | Format string for the path to input right image                            |
 | `--disp_size`        | `256`          | Maximum possible disparity value                                           |
 | `--P1`               | `10`           | Penalty for disparity change of ±1                                         |
 | `--P2`               | `120`          | Penalty for disparity change > 1                                           |
 | `--uniqueness`       | `0.80`         | Margin ratio for uniqueness constraint                                     |
 | `--num_paths`        | `8`            | Number of scanlines used in cost aggregation (4 or 8)                      |
 | `--min_disp`         | `-160`         | Minimum disparity value                                                    |
 | `--LR_max_diff`      | `1`            | Maximum allowed left-right disparity difference                            |
 | `--census_type`      | `1`            | Census transform type (0: CENSUS_9x7, 1: SYMMETRIC_CENSUS_9x7)             |
 | `--interval`         | `1`            | Polling interval (in seconds) for checking new stereo image pairs          |
 | `--output_dir`       | `.`            | Directory to save `disparity.xml` and `disparity_color.png`                |
 | `--no_display`       | `0`            | Set to `1` to skip interactive display window (required for pipeline/headless use) |
 | `--help or -h`       |                | Show help message                                                          |
 ### **Custom Parameters** 
 You can override any parameter through command-line arguments. Below is an example with some customized parameters:
 ```bash
 ./stereosgm_image data/lc%05d.bmp data/rc%05d.bmp \
  --disp_size=128 --P1=8 --P2=32 --interval=2
 ```
 # libSGM (Original)
 ---
 A CUDA implementation performing Semi-Global Matching.
 ## Introduction
 ---
 libSGM is a library that implements the Semi-Global Matching algorithm in CUDA.  
 From a pair of appropriately calibrated input images, we can obtain the disparity map.
 ## Features
 ---
 Because it uses CUDA, we can compute the disparity map at high speed.
 ## Performance
 The libSGM performance obtained from benchmark sample
 ### Settings
 - image size : 1024 x 440
 - disparity size : 128
 - sgm path : 4 path
 - subpixel : enabled
 ### Results
 |Device|CUDA version|Processing Time[Milliseconds]|FPS|
 |---|---|---|---|
 |GTX 1080 Ti|10.1|2.0|495.1|
 |GeForce RTX 3080|11.1|1.5|651.3|
 |Tegra X2|10.0|28.5|35.1|
 |Xavier(MODE_15W)|10.2|17.3|57.7|
 |Xavier(MAXN)|10.2|9.0|110.7|
 ## Requirements
 |Package Name|Minimum Requirements|Note|
 |---|---|---|
 |CMake|version >= 3.18||
 |CUDA Toolkit|compute capability >= 3.5||
 |OpenCV|version >= 3.4.8|for samples|
 |OpenCV CUDA module|version >= 3.4.8|for OpenCV wrapper|
 |ZED SDK|version >= 3.0|for ZED sample|
 ## Build Instructions
 ```
 $ git clone https://github.com/fixstars/libSGM.git
 $ cd libSGM
 $ git submodule update --init  # It is needed if ENABLE_TESTS option is set to ON
 $ mkdir build
 $ cd build
 $ cmake ../  # Several options available
 $ make
 ```
 ## Sample Execution
 ```
 $ pwd
 .../libSGM
 $ cd build
 $ cmake .. -DENABLE_SAMPLES=on
 $ make
 $ cd sample
 $ ./stereosgm_movie <left image path format> <right image path format> <disparity_size>
 left image path format: the format used for the file paths to the left input images
 right image path format: the format used for the file paths to the right input images
 disparity_size: the maximum number of disparities (optional)
 ```
 "disparity_size" is optional. By default, it is 128.
 Next, we explain the meaning of the "left image path format" and "right image path format".  
 When provided with the following set of files, we should pass the "path formats" given below.
 ```
 left_image_0000.pgm
 left_image_0001.pgm
 left_image_0002.pgm
 left_image_0003.pgm
 ...
 right_image_0000.pgm
 right_image_0001.pgm
 right_image_0002.pgm
 right_image_0003.pgm
 ```
 ```
 $ ./stereosgm_movie left_image_%04d.pgm right_image_%04d.pgm
 ```
 The sample images available at [Daimler Urban Scene Segmentation Benchmark Dataset 2014](http://www.6d-vision.com/scene-labeling) are used to test the software.
 ## Test Execution
 libSGM uses [Google Test](https://github.com/google/googletest) for tests as Git submodule.  
 So, we first need to initialize the submodule with the following command.
 ```
 $ pwd
 .../libSGM
 $ git submodule update --init
 ```
 We can run tests after a build.
 ```
 $ pwd
 .../libSGM
 $ cd build
 $ cd test
 $ ./sgm-test
 ```
 The test code compares our implementation of each function against a naive implementation.
 ## Python pipeline runner dependencies
 `run_sgm_pipeline.py` uses only the Python standard library. Disparity is computed by the compiled `stereosgm_new` binary.
 See `requirements.txt` in this folder for system build requirements (CUDA, CMake, OpenCV C++).
 ```bash
 # No pip packages needed for the Python runner.
 # Build the binary first (see above), then:
 cd ~/Speckle-Scanner/05_disparity/libsgm
 python run_sgm_pipeline.py --project <project> --date <date>
 ```
 ## Author
 The "adaskit Team"  
 The adaskit is an open-source project created by [Fixstars Corporation](https://www.fixstars.com/) and its subsidiary companies including [Fixstars Autonomous Technologies](https://at.fixstars.com/), aimed at contributing to the ADAS industry by developing high-performance implementations for algorithms with high computational cost.
 ## License
 Apache License 2.0
@@ -0,0 +1,180 @@
 /*
 Copyright 2016 Fixstars Corporation
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
 #ifndef __LIBSGM_H__
 #define __LIBSGM_H__
 /**
 * @mainpage stereo-sgm
 * See sgm::StereoSGM
 */
 /**
 * @file libsgm.h
 * stereo-sgm main header
 */
 #include "libsgm_config.h"
 #if defined(LIBSGM_SHARED)
 #if defined(WIN32) || defined(_WIN32)
 #if defined sgm_EXPORTS
 #define LIBSGM_API __declspec(dllexport)
 #else
 #define LIBSGM_API __declspec(dllimport)
 #endif
 #else
 #define LIBSGM_API __attribute__((visibility("default")))
 #endif
 #else
 #define LIBSGM_API
 #endif
 namespace sgm
 {
 /**
 * @brief Indicates input/output pointer type.
 *
 * The value is a two-bit code. Going by the enumerator names, bit 0 is set
 * when the input pointers are CUDA (device) pointers and bit 1 is set when
 * the output pointer is a CUDA (device) pointer; a cleared bit means host.
 */
 enum ExecuteInOut
 {
 	EXECUTE_INOUT_HOST2HOST = 0x0, //!< bit 1 clear, bit 0 clear
 	EXECUTE_INOUT_CUDA2HOST = 0x1, //!< bit 1 clear, bit 0 set
 	EXECUTE_INOUT_HOST2CUDA = 0x2, //!< bit 1 set,   bit 0 clear
 	EXECUTE_INOUT_CUDA2CUDA = 0x3, //!< bit 1 set,   bit 0 set
 };
 /**
 * @brief Indicates number of scanlines which will be used.
 */
 enum class PathType
 {
 	/** Horizontal and vertical paths. */
 	SCAN_4PATH = 0,
 	/** Horizontal, vertical and oblique paths. */
 	SCAN_8PATH = 1
 };
 /**
 * @brief Indicates census type which will be used.
 */
 enum class CensusType
 {
 	/** Census transform, 9x7 variant (ordinal 0). */
 	CENSUS_9x7 = 0,
 	/** Symmetric census transform, 9x7 variant (ordinal 1). */
 	SYMMETRIC_CENSUS_9x7 = 1
 };
 /**
 * @brief StereoSGM class
 *
 * Public entry point for stereo semi-global matching. The object is
 * configured at construction time and then run through execute().
 */
 class StereoSGM
 {
 public:
 	/** Number of fractional bits in a disparity value when subpixel is enabled. */
 	static const int SUBPIXEL_SHIFT = 4;
 	/** Scale factor applied to disparity values when subpixel is enabled (1 << SUBPIXEL_SHIFT). */
 	static const int SUBPIXEL_SCALE = (1 << SUBPIXEL_SHIFT);
 	/**
 	* @brief Available options for StereoSGM
 	*/
 	struct Parameters
 	{
 		int P1;
 		int P2;
 		float uniqueness;
 		bool subpixel;
 		PathType path_type;
 		int min_disp;
 		int LR_max_diff;
 		CensusType census_type;
 		/**
 		* @param P1 Penalty on the disparity change by plus or minus 1 between neighbor pixels.
 		* @param P2 Penalty on the disparity change by more than 1 between neighbor pixels.
 		* @param uniqueness Margin in ratio by which the best cost function value should be at least second one.
 		* @param subpixel Disparity value has 4 fractional bits if subpixel option is enabled.
 		* @param path_type Number of scanlines used in cost aggregation.
 		* @param min_disp Minimum possible disparity value.
 		* @param LR_max_diff Acceptable difference pixels which is used in LR check consistency. LR check consistency will be disabled if this value is set to negative.
 		* @param census_type Type of census transform.
 		*/
 		LIBSGM_API Parameters(int P1 = 10, int P2 = 120, float uniqueness = 0.95f, bool subpixel = false, PathType path_type = PathType::SCAN_8PATH,
 			int min_disp = 0, int LR_max_diff = 1, CensusType census_type = CensusType::SYMMETRIC_CENSUS_9x7);
 	};
 	/**
 	* @param width Processed image's width.
 	* @param height Processed image's height.
 	* @param disparity_size It must be 64, 128 or 256.
 	* @param input_depth_bits Processed image's bits per pixel. It must be 8, 16 or 32.
 	* @param output_depth_bits Disparity image's bits per pixel. It must be 8 or 16.
 	* @param inout_type Specify input/output pointer type. See sgm::ExecuteInOut.
 	* @param param Matching options. Defaults to a default-constructed Parameters.
 	* @attention
 	* output_depth_bits must be set to 16 when subpixel is enabled.
 	*/
 	LIBSGM_API StereoSGM(int width, int height, int disparity_size, int input_depth_bits, int output_depth_bits,
 		ExecuteInOut inout_type, const Parameters& param = Parameters());
 	/**
 	* @param width Processed image's width.
 	* @param height Processed image's height.
 	* @param disparity_size It must be 64, 128 or 256.
 	* @param input_depth_bits Processed image's bits per pixel. It must be 8, 16 or 32.
 	* @param output_depth_bits Disparity image's bits per pixel. It must be 8 or 16.
 	* @param src_pitch Source image's pitch (pixels).
 	* @param dst_pitch Destination image's pitch (pixels).
 	* @param inout_type Specify input/output pointer type. See sgm::ExecuteInOut.
 	* @param param Matching options. Defaults to a default-constructed Parameters.
 	* @attention
 	* output_depth_bits must be set to 16 when subpixel is enabled.
 	*/
 	LIBSGM_API StereoSGM(int width, int height, int disparity_size, int input_depth_bits, int output_depth_bits, int src_pitch, int dst_pitch,
 		ExecuteInOut inout_type, const Parameters& param = Parameters());
 	LIBSGM_API virtual ~StereoSGM();
 	/**
 	* Execute stereo semi global matching.
 	* @param left_pixels  Pointer to the input left image.
 	* @param right_pixels Pointer to the input right image.
 	* @param dst          Output pointer. User must allocate enough memory.
 	* @attention
 	* You need to allocate dst memory at least width x height x sizeof(element_type) bytes.
 	* The element_type is uint8_t for output_depth_bits == 8 and uint16_t for output_depth_bits == 16.
 	* Note that each dst element value is multiplied by StereoSGM::SUBPIXEL_SCALE if the subpixel option is enabled.
 	* The value used for an invalid disparity equals the return value of the `get_invalid_disparity` member function.
 	*/
 	LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst);
 	/**
 	* Generate invalid disparity value from Parameters::min_disp and Parameters::subpixel
 	* @attention
 	* Cast properly if you receive disparity value as `unsigned` type.
 	* See sample/movie for an example of this.
 	*/
 	LIBSGM_API int get_invalid_disparity() const;
 private:
 	// Copy constructor and copy assignment are declared private so that
 	// client code cannot copy a StereoSGM object.
 	StereoSGM(const StereoSGM&);
 	StereoSGM& operator=(const StereoSGM&);
 	// Implementation class; only forward-declared in this header.
 	class Impl;
 	// Raw pointer to the implementation object.
 	// NOTE(review): presumably created in the constructors and released in
 	// the destructor — confirm in the source file.
 	Impl* impl_;
 };
 } // namespace sgm
 #endif // !__LIBSGM_H__
 #include "libsgm_wrapper.h"
@@ -0,0 +1,13 @@
 #ifndef __LIBSGM_CONFIG_H__
 #define __LIBSGM_CONFIG_H__
 #cmakedefine LIBSGM_SHARED
 #define LIBSGM_VERSION @libSGM_VERSION@
 #define LIBSGM_VERSION_MAJOR @libSGM_VERSION_MAJOR@
 #define LIBSGM_VERSION_MINOR @libSGM_VERSION_MINOR@
 #define LIBSGM_VERSION_PATCH @libSGM_VERSION_PATCH@
 #cmakedefine BUILD_OPENCV_WRAPPER
 #endif // __LIBSGM_CONFIG_H__
@@ -0,0 +1,84 @@
 #ifndef __LIBSGM_WRAPPER_H__
 #define __LIBSGM_WRAPPER_H__
 #include "libsgm.h"
 #include <memory>
 #ifdef BUILD_OPENCV_WRAPPER
 #include <opencv2/core/cuda.hpp>
 #endif
 namespace sgm
 {
 /**
 * @brief LibSGMWrapper class, a wrapper around sgm::StereoSGM.
 *
 * The execute() overloads that take OpenCV matrices are only declared when
 * BUILD_OPENCV_WRAPPER is defined.
 */
 class LibSGMWrapper
 {
 public:
 	/**
 	* @param numDisparity Maximum disparity minus minimum disparity.
 	* @param P1 Penalty on the disparity change by plus or minus 1 between neighbor pixels.
 	* @param P2 Penalty on the disparity change by more than 1 between neighbor pixels.
 	* @param uniquenessRatio Margin in ratio by which the best cost function value should be at least second one.
 	* @param subpixel Disparity value has 4 fractional bits if subpixel option is enabled.
 	* @param pathType Number of scanlines used in cost aggregation.
 	* @param minDisparity Minimum possible disparity value.
 	* @param lrMaxDiff Acceptable difference pixels which is used in LR check consistency. LR check consistency will be disabled if this value is set to negative.
 	* @param censusType Type of census transform.
 	*/
 	LIBSGM_API LibSGMWrapper(int numDisparity = 128, int P1 = 10, int P2 = 120, float uniquenessRatio = 0.95f,
 		bool subpixel = false, PathType pathType = PathType::SCAN_8PATH, int minDisparity = 0, int lrMaxDiff = 1, CensusType censusType = CensusType::SYMMETRIC_CENSUS_9x7);
 	LIBSGM_API ~LibSGMWrapper();
 	// Read-only accessors for the configured options.
 	LIBSGM_API int getNumDisparities() const;
 	LIBSGM_API int getP1() const;
 	LIBSGM_API int getP2() const;
 	LIBSGM_API float getUniquenessRatio() const;
 	LIBSGM_API bool hasSubpixel() const;
 	LIBSGM_API PathType getPathType() const;
 	LIBSGM_API int getMinDisparity() const;
 	LIBSGM_API int getLrMaxDiff() const;
 	LIBSGM_API CensusType getCensusType() const;
 	LIBSGM_API int getInvalidDisparity() const;
 #ifdef BUILD_OPENCV_WRAPPER
 	/**
 	* Execute stereo semi global matching via wrapper class.
 	* @param I1        Input left image.  Image's type must be CV_8U, CV_16U or CV_32S.
 	* @param I2        Input right image.  Image's size and type must be the same as I1.
 	* @param disparity Output image.  Its memory will be allocated automatically depending on the input image size.
 	* @attention
 	* The type of the output image `disparity` is CV_16S.
 	* Note that each disparity element value is multiplied by StereoSGM::SUBPIXEL_SCALE if the subpixel option is enabled.
 	*/
 	LIBSGM_API void execute(const cv::cuda::GpuMat& I1, const cv::cuda::GpuMat& I2, cv::cuda::GpuMat& disparity);
 	/**
 	* Execute stereo semi global matching via wrapper class.
 	* @param I1        Input left image.  Image's type must be CV_8U, CV_16U or CV_32S.
 	* @param I2        Input right image.  Image's size and type must be the same as I1.
 	* @param disparity Output image.  Its memory will be allocated automatically depending on the input image size.
 	* @attention
 	* The type of the output image `disparity` is CV_16S.
 	* Note that each disparity element value is multiplied by StereoSGM::SUBPIXEL_SCALE if the subpixel option is enabled.
 	*/
 	LIBSGM_API void execute(const cv::Mat& I1, const cv::Mat& I2, cv::Mat& disparity);
 #endif // BUILD_OPENCV_WRAPPER
 private:
 	// Only forward-declared in this header.
 	struct Creator;
 	// Owned sgm::StereoSGM instance.
 	std::unique_ptr<sgm::StereoSGM> sgm_;
 	int numDisparity_;
 	sgm::StereoSGM::Parameters param_;
 	// NOTE(review): presumably remembers the settings sgm_ was last created
 	// with so it can be reused — confirm in the implementation.
 	std::unique_ptr<Creator> prev_;
 };
 } // namespace sgm
 #endif // __LIBSGM_WRAPPER_H__
@@ -0,0 +1,9 @@
 %YAML:1.0
 ---
 Q: !!opencv-matrix
   rows: 4
   cols: 4
   dt: d
   data: [ 1., 0., 0., -452.58969879150391, 0., 1., 0.,
       -732.08112335205078, 0., 0., 0., 3269.0086731896672, 0., 0.,
       1.0200604866284457, 1125.7629393222996 ]
@@ -0,0 +1,18 @@
 # 05_disparity/libsgm — dependencies
 #
 # The Python pipeline runner (run_sgm_pipeline.py) uses only the standard library.
 # Disparity computation is done by the compiled stereosgm_new binary (CUDA C++).
 #
 # --- System build requirements (not installable via pip) ---
 #   - NVIDIA GPU with CUDA compute capability >= 3.5
 #   - CUDA Toolkit 11.x or 12.x
 #   - CMake >= 3.18
 #   - OpenCV (C++ headers + libs, for building libSGM samples)
 #
 # Build:
 #   cd ~/Speckle-Scanner/05_disparity/libsgm
 #   mkdir -p build && cd build
 #   cmake .. -DENABLE_SAMPLES=on
 #   make stereosgm_new -j4
 #
 # No pip packages required to run run_sgm_pipeline.py after the binary is built.
@@ -0,0 +1,201 @@
 """
 Pipeline runner for libSGM stereo disparity.
 Resolves all paths from the project folder structure and drives
 the stereosgm_new binary for each scan in a session (or all sessions on a date).
 For each scan it takes the LAST matched lc_/rc_ image pair from 02_rect_images/
 (images sorted by timestamp — highest timestamp = last acquired image).
 Output layout per scan:
  <processing_dir>/<project>/<date>/<session>/<scan>/
    02_rect_images/     <- input (lc_ts<last>.png + rc_ts<same>.png)
    03_sgm_disp_map/    <- disparity.xml + disparity_color.png  (created here)
    05_sgm_pcl/         <- untouched
 Binary:
  ~/Speckle-Scanner/05_disparity/libsgm/build/sample/stereosgm_new
 """
 import sys
 import re
 import argparse
 import subprocess
 from pathlib import Path
 # Resolve config.py from ~/Speckle-Scanner regardless of CWD
 sys.path.insert(0, str(Path.home() / "Speckle-Scanner"))
 import config  # noqa: E402
 BINARY = Path(__file__).parent / "build" / "sample" / "stereosgm_new"
 def extract_ts_token(filename, prefix="lc_"):
    """Parse the timestamp token at the start of an image file name.

    Args:
        filename: Bare file name, e.g. ``lc_ts1634840093.png`` or
            ``lc_ts1634840093_ck....png``.
        prefix: Camera prefix expected in front of the token. It is matched
            literally and case-insensitively.

    Returns:
        ``(token, value)`` where ``token`` is the lower-cased ``ts<digits>``
        string and ``value`` is the digit run as an int, or ``(None, None)``
        when the name does not start with ``<prefix>ts<digits>``.
    """
    pattern = re.compile(rf"^{re.escape(prefix)}(ts(\d+))", re.IGNORECASE)
    hit = pattern.search(filename)
    if hit is None:
        return None, None
    # Group 1 is the whole token, group 2 only its digits.
    return hit.group(1).lower(), int(hit.group(2))
 def find_rc_for_ts(rect_dir, ts_token):
    """Find the right-camera image that carries exactly the timestamp ``ts_token``.

    Files named ``rc_<ts_token>_<suffix>.png`` are preferred. If there is none,
    any ``rc_<ts_token>*.png`` is accepted as long as the token is not directly
    followed by another digit. Without that check ``ts123`` would also match
    ``rc_ts1234.png``, which belongs to a different (longer) timestamp.

    Args:
        rect_dir: ``pathlib.Path`` of the directory to search (not recursive).
        ts_token: Timestamp token such as ``ts1634840093``.

    Returns:
        The first matching path in sorted order, or ``None`` if nothing matches.
    """
    rc_matches = sorted(rect_dir.glob(f"rc_{ts_token}_*.png"))
    if not rc_matches:
        # Reject candidates whose digit run continues past the requested token.
        exact_token = re.compile(rf"^rc_{re.escape(ts_token)}(?!\d)", re.IGNORECASE)
        rc_matches = sorted(
            path
            for path in rect_dir.glob(f"rc_{ts_token}*.png")
            if exact_token.match(path.name)
        )
    return rc_matches[0] if rc_matches else None
 def find_last_lc_rc_pair(rect_dir):
    """Pick the most recent left/right image pair in ``rect_dir``.

    Every ``lc_ts*.png`` whose timestamp token can be parsed and for which
    ``find_rc_for_ts`` finds a partner is a candidate. The candidate with the
    largest integer timestamp is returned.

    Returns:
        ``(lc_path, rc_path)``, or ``(None, None)`` when no complete pair exists.
    """
    folder = Path(rect_dir)
    newest = None  # (timestamp, lc_path, rc_path) of the best candidate so far
    for left in folder.glob("lc_ts*.png"):
        token, stamp = extract_ts_token(left.name, "lc_")
        if token is None:
            continue
        right = find_rc_for_ts(folder, token)
        if right is None:
            continue
        # ">=" lets a later-listed file win on equal timestamps, which is what
        # a stable sort followed by taking the last element would select.
        if newest is None or stamp >= newest[0]:
            newest = (stamp, left, right)
    if newest is None:
        return None, None
    return newest[1], newest[2]
 def build_cmd(lc, rc, output_dir, sgm_args):
    """Assemble the argument vector for one stereosgm_new invocation.

    Args:
        lc: Path of the left image (first positional argument).
        rc: Path of the right image (second positional argument).
        output_dir: Directory passed to the binary as ``--output_dir``.
        sgm_args: Mapping of option name to value. Entries whose value is
            ``None`` are omitted from the command line.

    Returns:
        List of strings: binary path, both images, the fixed options, then one
        ``--<name>=<value>`` item per non-``None`` entry in mapping order.
    """
    fixed = [
        str(BINARY),
        str(lc),
        str(rc),
        f"--output_dir={output_dir}",
        "--no_display=1",
    ]
    optional = [
        f"--{name}={value}"
        for name, value in sgm_args.items()
        if value is not None
    ]
    return fixed + optional
 def run_scan(project, date, session, scan, sgm_args):
    """Run stereosgm_new on the newest rectified lc/rc pair of one scan.

    Args:
        project: Project folder name.
        date: Date folder name.
        session: Session folder name.
        scan: Scan folder name.
        sgm_args: Option mapping forwarded to ``build_cmd``.

    Returns:
        True when the binary exited with code 0. False when the scan was
        skipped (no ``02_rect_images`` folder or no matched image pair), when
        the binary could not be started, or when it exited with a non-zero code.
    """
    rect_dir = config.PROCESSING_DIR / project / date / session / scan / "02_rect_images"
    if not rect_dir.exists():
        print(f"[SKIP] {session}/{scan}: 02_rect_images not found at {rect_dir}")
        return False
    # find_last_lc_rc_pair returns either a complete pair or (None, None), so a
    # single check covers "no lc images" as well as "lc images without partner".
    lc, rc = find_last_lc_rc_pair(rect_dir)
    if lc is None or rc is None:
        print(f"[SKIP] {session}/{scan}: no matched lc_ts*/rc_ts* image pair found in {rect_dir}")
        return False
    output_dir = config.get_processing_step_dir(project, date, session, scan, "03_sgm_disp_map")
    print(f"\n{'='*60}")
    print(f"[SCAN] {session}/{scan}")
    print(f"  lc     : {lc.name}")
    print(f"  rc     : {rc.name}")
    print(f"  output : {output_dir}")
    print(f"{'='*60}")
    cmd = build_cmd(lc, rc, output_dir, sgm_args)
    try:
        result = subprocess.run(cmd)
    except OSError as exc:
        # The binary could not be launched (e.g. not executable). Report it as
        # a failed scan so that the remaining scans of the batch still run.
        print(f"[FAIL] {session}/{scan}: could not start {cmd[0]}: {exc}")
        return False
    if result.returncode != 0:
        print(f"[FAIL] {session}/{scan} exited with code {result.returncode}")
        return False
    print(f"[DONE] {session}/{scan}")
    return True
 def run_session(project, date, session, scan_arg, sgm_args):
    """Process one scan, or every scan, of a session.

    Args:
        project: Project folder name.
        date: Date folder name.
        session: Session folder name.
        scan_arg: Name of the single scan to process. A falsy value means all
            scans returned by ``config.list_scan_dirs``.
        sgm_args: Option mapping forwarded to ``run_scan``.

    Returns:
        ``(scans, failed)``: the scan names that were attempted and the
        ``"<session>/<scan>"`` labels of those for which ``run_scan`` returned
        False. Both lists are empty when the session has no scan folders.
    """
    targets = [scan_arg] if scan_arg else config.list_scan_dirs(project, date, session)
    if not targets:
        # Only reachable when listing: an explicit scan always yields one target.
        print(f"[WARN] No scan folders found in {project}/{date}/{session}")
        return [], []
    if not scan_arg:
        print(f"\n  Session {session}: {len(targets)} scan(s) found")
    unsuccessful = [
        f"{session}/{name}"
        for name in targets
        if not run_scan(project, date, session, name, sgm_args)
    ]
    return targets, unsuccessful
 def main():
    """Command-line entry point: run stereosgm_new over one or more sessions.

    Parses the project location and the optional SGM parameters, checks that
    the stereosgm_new binary exists, processes the requested session (or every
    session of the date) and prints a summary.

    Exits with status 1 when the binary is missing, when no session folder is
    found, or when at least one scan failed or was skipped.
    """
    parser = argparse.ArgumentParser(
        description="libSGM disparity pipeline runner — resolves paths from project structure"
    )
    # Project location
    parser.add_argument("--project", required=True,  help="Project name (e.g. Olsen_wings)")
    parser.add_argument("--date",    required=True,  help="Date string (e.g. 2026-05-12)")
    parser.add_argument("--session", default=None,   help="Session name (e.g. session1); omit to process ALL sessions on that date")
    parser.add_argument("--scan",    default=None,   help="Single scan (e.g. Scan000001); omit to process all scans in the session")
    # SGM parameters — all optional, forwarded to stereosgm_new.
    # --disp_size is validated here, like --num_paths and --census_type, so an
    # unsupported value is rejected once instead of failing for every scan.
    parser.add_argument("--disp_size",   type=int,   default=None, choices=[64, 128, 256], help="Maximum disparity value (64, 128, or 256; default 256)")
    parser.add_argument("--P1",          type=int,   default=None, help="SGM penalty for disparity change of ±1 (default 10)")
    parser.add_argument("--P2",          type=int,   default=None, help="SGM penalty for disparity change >1 (default 120)")
    parser.add_argument("--uniqueness",  type=float, default=None, help="Uniqueness ratio threshold (default 0.80)")
    parser.add_argument("--num_paths",   type=int,   default=None, choices=[4, 8], help="Scanlines for cost aggregation: 4 or 8 (default 8)")
    parser.add_argument("--min_disp",    type=int,   default=None, help="Minimum disparity value (default -160)")
    parser.add_argument("--LR_max_diff", type=int,   default=None, help="Max left-right disparity difference (default 1)")
    parser.add_argument("--census_type", type=int,   default=None, choices=[0, 1], help="Census transform type: 0=CENSUS_9x7, 1=SYMMETRIC_CENSUS_9x7 (default 1)")
    args = parser.parse_args()
    if not BINARY.exists():
        print(f"ERROR: stereosgm_new binary not found at {BINARY}")
        # The build directory is not part of the repository, so the hint has to
        # include creating and configuring it before calling make.
        print(
            "Build it first: cd ~/Speckle-Scanner/05_disparity/libsgm && mkdir -p build && cd build "
            "&& cmake .. -DENABLE_SAMPLES=on && make stereosgm_new"
        )
        sys.exit(1)
    # Options left at None are not forwarded (see build_cmd).
    sgm_args = {
        "disp_size":   args.disp_size,
        "P1":          args.P1,
        "P2":          args.P2,
        "uniqueness":  args.uniqueness,
        "num_paths":   args.num_paths,
        "min_disp":    args.min_disp,
        "LR_max_diff": args.LR_max_diff,
        "census_type": args.census_type,
    }
    # Determine sessions to process
    if args.session:
        sessions = [args.session]
    else:
        sessions = config.list_session_dirs(args.project, args.date)
        if not sessions:
            print(f"No session folders found under {args.project}/{args.date}")
            sys.exit(1)
        print(f"Found {len(sessions)} session(s): {sessions}")
    total_scans = 0
    all_failed  = []
    for session in sessions:
        scans, failed = run_session(
            args.project, args.date, session, args.scan, sgm_args
        )
        total_scans += len(scans)
        all_failed.extend(failed)
    print(f"\n{'='*60}")
    print(f"Finished: {total_scans - len(all_failed)}/{total_scans} scans succeeded.")
    if all_failed:
        print(f"Failed: {all_failed}")
        sys.exit(1)
 if __name__ == "__main__":
    main()
@@ -0,0 +1,62 @@
 cmake_minimum_required(VERSION 3.18)
 project(samples LANGUAGES CXX CUDA)
 # Compile all samples as C++17 with compiler extensions switched off.
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_EXTENSIONS OFF)
 # OpenCV is mandatory: every sample uses its include directories and libraries.
 find_package(OpenCV REQUIRED)
 # Helper sources that are compiled into each sample executable.
 set(SRCS_COMMON sample_common.cpp sample_common.h)
 # Samples that are always built. Entry <name> produces the executable
 # stereosgm_<name> from stereosgm_<name>.cpp plus the common sources.
 foreach(SAMPLE_NAME image movie new benchmark reprojection)
 	add_executable(stereosgm_${SAMPLE_NAME} stereosgm_${SAMPLE_NAME}.cpp ${SRCS_COMMON})
 	target_include_directories(stereosgm_${SAMPLE_NAME} PRIVATE ${OpenCV_INCLUDE_DIRS})
 	target_link_libraries(stereosgm_${SAMPLE_NAME} sgm ${OpenCV_LIBS})
 endforeach()
 # Sample working on cv::GpuMat, only built when BUILD_OPENCV_WRAPPER is set.
 if(BUILD_OPENCV_WRAPPER)
 	add_executable(stereosgm_image_cv_gpumat stereosgm_image_cv_gpumat.cpp ${SRCS_COMMON})
 	target_include_directories(stereosgm_image_cv_gpumat PRIVATE ${OpenCV_INCLUDE_DIRS})
 	target_link_libraries(stereosgm_image_cv_gpumat sgm ${OpenCV_LIBS})
 endif()
 # ZED camera sample, only built when ENABLE_ZED_DEMO is set.
 if(ENABLE_ZED_DEMO)
 	# Platform-specific default locations of the ZED SDK (cache entries, user-overridable).
 	if(WIN32)
 		set(ZED_SDK_LIB "C:\\Program Files (x86)\\ZED SDK\\lib\\sl_zed64.lib" CACHE STRING "ZED SDK library(sl_zed**.llb) path.")
 		set(ZED_SDK_INCLUDE_DIR "C:\\Program Files (x86)\\ZED SDK\\include" CACHE STRING "ZED SDK include path.")
 	else()
 		set(ZED_SDK_LIB "/usr/local/zed/lib/libsl_zed.so" CACHE STRING "ZED SDK library(sl_zed**.llb) path.")
 		set(ZED_SDK_INCLUDE_DIR "/usr/local/zed/include" CACHE STRING "ZED SDK include path.")
 	endif()
 	find_package(ZED 3 REQUIRED)
 	# Remove the blanks that follow list separators in the NPP library list.
 	string(REGEX REPLACE [[; +]] [[;]] CUDA_NPP_LIBRARIES_ZED "${CUDA_NPP_LIBRARIES_ZED}")
 	add_executable(stereosgm_zed stereosgm_zed.cpp ${SRCS_COMMON})
 	target_include_directories(stereosgm_zed PRIVATE ${OpenCV_INCLUDE_DIRS} ${ZED_INCLUDE_DIRS})
 	target_link_directories(stereosgm_zed PRIVATE ${ZED_LIBRARY_DIR})
 	target_link_libraries(stereosgm_zed sgm ${OpenCV_LIBS} ${ZED_LIBRARIES} ${CUDA_NPP_LIBRARIES_ZED})
 endif()
@@ -0,0 +1,15 @@
 <?xml version="1.0"?>
 <opencv_storage>
 <!--  Intrinsic parameters -->
 <FocalLengthX>1267.485352</FocalLengthX> <!--  focal length x (pixel) -->
 <FocalLengthY>1224.548950</FocalLengthY> <!--  focal length y (pixel) -->
 <CenterX>472.735474</CenterX>            <!--  principal point x (pixel) -->
 <CenterY>175.787781</CenterY>            <!--  principal point y (pixel) -->
 <!--  Extrinsic parameters -->
 <BaseLine>0.214382</BaseLine>            <!--  baseline (meter) -->
 <Height>1.170000</Height>                <!--  height position (meter) -->
 <Tilt>0.081276</Tilt>                    <!--  tilt angle (radian) -->
 </opencv_storage>
@@ -0,0 +1,10 @@
 <?xml version="1.0"?>
 <opencv_storage>
 <FocalLengthX>1249.7700195</FocalLengthX>
 <FocalLengthY>1249.7700195</FocalLengthY>
 <CenterX>480.8460083</CenterX>
 <CenterY>237.4100037</CenterY>
 <BaseLine>0.2339240</BaseLine>
 <Height>1.2000000</Height>
 <Tilt>0.07</Tilt>
 </opencv_storage>
@@ -0,0 +1,4 @@
 # sample mynew: builds the stereosgm_new executable from stereosgm_new.cpp plus
 # the common sample sources, using the OpenCV include directories and linking
 # against the sgm target and the OpenCV libraries.
 add_executable(stereosgm_new stereosgm_new.cpp ${SRCS_COMMON})
 target_include_directories(stereosgm_new PRIVATE ${OpenCV_INCLUDE_DIRS})
 target_link_libraries(stereosgm_new sgm ${OpenCV_LIBS})
@@ -0,0 +1,160 @@
 /*
 Copyright 2016 Fixstars Corporation
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http ://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
 #include <iostream>
 #include <chrono>
 #include <stdexcept>
 #include <opencv2/core.hpp>
 #include <opencv2/imgproc.hpp>
 #include <opencv2/highgui.hpp>
 #include <fstream> // Add this line to use std::ofstream for file output
 #include <libsgm.h>
 #include "sample_common.h"
 static const std::string keys =
 "{ @left-image-format  | <none> | format string for path to input left image  }"
 "{ @right-image-format | <none> | format string for path to input right image }"
 "{ disp_size           |    256 | maximum possible disparity value            }"
 "{ start_number        |      0 | index to start reading                      }"
 "{ help h              |        | display this help and exit                  }";
 class ImagePreprocessor {
 public:
    void preprocess_image_pair(cv::Mat& img_left, cv::Mat& img_right) {
        // Get the shape of both images
        int h1 = img_left.rows, w1 = img_left.cols;
        int h2 = img_right.rows, w2 = img_right.cols;
        // Find the minimum height and width between the two images
        int min_height = std::min(h1, h2);
        int min_width = std::min(w1, w2);
        // Crop both images to match the minimum height and width
        img_left = img_left(cv::Rect(0, 0, min_width, min_height));
        img_right = img_right(cv::Rect(0, 0, min_width, min_height));
        // Convert to CV_8U grayscale
        //cv::cvtColor(img_left, img_left, cv::COLOR_BGR2GRAY);
        img_left.convertTo(img_left, CV_8U); // Ensure it's in CV_8U format
        //cv::cvtColor(img_right, img_right, cv::COLOR_BGR2GRAY);
        img_right.convertTo(img_right, CV_8U); // Ensure it's in CV_8U format
    }
 };
 /*
  * Sample entry point: processes a numbered sequence of stereo image pairs.
  *
  * For frame numbers start_number, start_number + 1, ... the left and right
  * path format strings are expanded with the frame number, the images are read
  * as grayscale, preprocessed, matched with sgm::StereoSGM on the GPU, written
  * to disparity_output_<frame>.png in the working directory and shown in two
  * windows. The loop ends as soon as one image of a pair cannot be read.
  */
 int main(int argc, char* argv[])
 {
    cv::CommandLineParser parser(argc, argv, keys);
    if (parser.has("help")) {
        parser.printMessage();
        return 0;
    }
    const std::string image_format_L = parser.get<cv::String>("@left-image-format");
    const std::string image_format_R = parser.get<cv::String>("@right-image-format");
    const int disp_size = parser.get<int>("disp_size");
    const int start_number = parser.get<int>("start_number");
    // Reject missing or unparsable command-line arguments.
    if (!parser.check()) {
        parser.printErrors();
        parser.printMessage();
        std::exit(EXIT_FAILURE);
    }
    cv::Mat I1, I2;
    ImagePreprocessor preprocessor; // crops the pair to a common size, see ImagePreprocessor
    // No upper bound: the loop is left through the break below.
    for (int frame_no = start_number;; frame_no++) {
        I1 = cv::imread(cv::format(image_format_L.c_str(), frame_no), cv::IMREAD_GRAYSCALE);
        I2 = cv::imread(cv::format(image_format_R.c_str(), frame_no), cv::IMREAD_GRAYSCALE);
        // An unreadable or missing image ends the sequence.
        if (I1.empty() || I2.empty()) {
            std::cout << "No more images to process or image pair not found." << std::endl;
            break;
        }
        // Preprocess the images
        preprocessor.preprocess_image_pair(I1, I2);
        const int width = I1.cols;
        const int height = I1.rows;
        // Bits per pixel of the input and of the disparity output
        // (8-bit output for disp_size below 256, 16-bit otherwise).
        const int src_depth = I1.type() == CV_8U ? 8 : 16;
        const int dst_depth = disp_size < 256 ? 8 : 16;
        // Buffer sizes in bytes, computed for tightly packed rows.
        const int src_bytes = src_depth * width * height / 8;
        const int dst_bytes = dst_depth * width * height / 8;
        // The matcher and the device buffers are created anew for every frame.
        sgm::StereoSGM sgm(width, height, disp_size, src_depth, dst_depth, sgm::EXECUTE_INOUT_CUDA2CUDA);
        device_buffer d_I1(src_bytes), d_I2(src_bytes), d_disparity(dst_bytes);
        cv::Mat disparity(height, width, dst_depth == 8 ? CV_8S : CV_16S), disparity_color;
        const int invalid_disp = sgm.get_invalid_disparity();
        d_I1.upload(I1.data);
        d_I2.upload(I2.data);
        // The measured time covers execute() plus the device synchronization.
        const auto t1 = std::chrono::system_clock::now();
        sgm.execute(d_I1.data, d_I2.data, d_disparity.data);
        cudaDeviceSynchronize();
        const auto t2 = std::chrono::system_clock::now();
        const auto duration = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
        const double fps = 1e6 / duration;
        d_disparity.download(disparity.data);
        // NOTE(review): disparity is a signed matrix (CV_8S/CV_16S); confirm how
        // the PNG writer stores signed data before relying on the file contents.
        cv::imwrite(cv::format("disparity_output_%04d.png", frame_no), disparity);
        // Disabled: dump of the disparity values as text, one image row per line.
        //std::ofstream disparity_file(cv::format("disparity_output_%04d.txt", frame_no));
        //if (disparity_file.is_open()) {
        //    for (int y = 0; y < disparity.rows; ++y) {
        //        for (int x = 0; x < disparity.cols; ++x) {
        //            disparity_file << disparity.at<short>(y, x) << " "; // Assuming disparity is CV_16S
        //        }
        //        disparity_file << std::endl;
        //    }
        //    disparity_file.close();
        //} else {
        //    std::cerr << "Error: Could not open text file for disparity output." << std::endl;
        //}
        // Print the size of the disparity map in MB
        double disparity_size_mb = static_cast<double>(dst_bytes) / (1024 * 1024);
        std::cout << "Size of disparity map: " << disparity_size_mb << " MB" << std::endl;
        // Draw results
        if (I1.type() != CV_8U)
            cv::normalize(I1, I1, 0, 255, cv::NORM_MINMAX, CV_8U);
        // The last argument is a mask of the pixels equal to the invalid marker.
        colorize_disparity(disparity, disparity_color, disp_size, disparity == invalid_disp);
        cv::putText(disparity_color, cv::format("sgm execution time: %4.1f[msec] %4.1f[FPS]",
            1e-3 * duration, fps), cv::Point(50, 50), 2, 0.75, cv::Scalar(255, 255, 255));
        cv::imshow("left image", I1);
        cv::imshow("disparity", disparity_color);
        cv::waitKey(0); // Hold the window open for inspection; press any key to continue
    }
    return 0;
 }
@@ -0,0 +1,29 @@
 /*
 Copyright 2016 Fixstars Corporation
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http ://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
 #include "sample_common.h"
 #include <opencv2/imgproc.hpp>
 // Renders a disparity map as a colour image.
 //   src       disparity map
 //   dst       receives the colour image (COLORMAP_TURBO)
 //   disp_size disparity value that is mapped to the brightest 8-bit level (255)
 //   mask      optional mask; pixels selected by it are set to 0 in dst
 void colorize_disparity(const cv::Mat& src, cv::Mat& dst, int disp_size, cv::InputArray mask)
 {
 	// Scale factor that maps disp_size to 255 during the 8-bit conversion.
 	const double to_byte = 255. / disp_size;
 	cv::Mat gray8;
 	src.convertTo(gray8, CV_8U, to_byte);
 	cv::applyColorMap(gray8, dst, cv::COLORMAP_TURBO);
 	if (mask.empty())
 		return;
 	dst.setTo(0, mask);
 }
@@ -0,0 +1,45 @@
 /*
 Copyright 2016 Fixstars Corporation
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http ://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
 #ifndef __SAMPLE_COMMON_H__
 #define __SAMPLE_COMMON_H__
 #include <opencv2/core.hpp>
 #include <cuda_runtime.h>
 // Prints `msg` to std::cerr and terminates the program with EXIT_FAILURE when
 // `expr` evaluates to false.
 // The body is wrapped in do { ... } while (0) so that the macro acts as one
 // statement and can be used as the branch of an if/else. The final line has
 // no line continuation, so the source line after the macro is not absorbed
 // into its definition. Use it with a trailing semicolon.
 #define ASSERT_MSG(expr, msg) \
 do { \
 	if (!(expr)) { \
 		std::cerr << msg << std::endl; \
 		std::exit(EXIT_FAILURE); \
 	} \
 } while (0)
 struct device_buffer
 {
 	device_buffer() : data(nullptr), size(0) {}
 	device_buffer(size_t count) : device_buffer() { allocate(count); }
 	~device_buffer() { cudaFree(data); }
 	void allocate(size_t count) { cudaMalloc(&data, count); size = count; }
 	void upload(const void* h_data) { cudaMemcpy(data, h_data, size, cudaMemcpyHostToDevice); }
 	void download(void* h_data) { cudaMemcpy(h_data, data, size, cudaMemcpyDeviceToHost); }
 	void* data;
 	size_t size;
 };
 void colorize_disparity(const cv::Mat& src, cv::Mat& dst, int disp_size, cv::InputArray mask = cv::noArray());
 #endif // !__SAMPLE_COMMON_H__
@@ -0,0 +1,140 @@
 /*
 Copyright 2016 Fixstars Corporation
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http ://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
 #include <iostream>
 #include <iomanip>
 #include <chrono>
 #include <opencv2/core.hpp>
 #include <opencv2/imgproc.hpp>
 #include <opencv2/highgui.hpp>
 #include <libsgm.h>
 #include "sample_common.h"
 static const std::string keys =
 "{ @left_img   | <none> | path to input left image                                       }"
 "{ @right_img  | <none> | path to input right image                                      }"
 "{ disp_size   |    128 | maximum possible disparity value                               }"
 "{ out_depth   |      8 | disparity image's bits per pixel                               }"
 "{ subpixel    |        | enable subpixel estimation                                     }"
 "{ num_paths   |      8 | number of scanlines used in cost aggregation                   }"
 "{ census_type |      1 | type of census transform (0:CENSUS_9x7 1:SYMMETRIC_CENSUS_9x7) }"
 "{ iterations  |    100 | number of iterations for measuring performance                 }"
 "{ help h      |        | display this help and exit                                     }";
 /*
  * Benchmark entry point: measures the execution time of sgm::StereoSGM on one
  * stereo pair.
  *
  * The pair is uploaded to the GPU once. execute() is then called
  * iterations + 1 times; the first call is not included in the average. The
  * mean time and the frame rate are printed, and the colourized result of the
  * last run is written to disparity.png. Invalid arguments terminate the
  * program through ASSERT_MSG.
  */
 int main(int argc, char* argv[])
 {
 	cv::CommandLineParser parser(argc, argv, keys);
 	if (parser.has("help")) {
 		parser.printMessage();
 		return 0;
 	}
 	cv::Mat I1 = cv::imread(parser.get<cv::String>("@left_img"), cv::IMREAD_UNCHANGED);
 	cv::Mat I2 = cv::imread(parser.get<cv::String>("@right_img"), cv::IMREAD_UNCHANGED);
 	const int disp_size = parser.get<int>("disp_size");
 	const int dst_depth = parser.get<int>("out_depth");
 	const bool subpixel = parser.has("subpixel");
 	const int num_paths = parser.get<int>("num_paths");
 	const auto census_type = static_cast<sgm::CensusType>(parser.get<int>("census_type"));
 	const int iterations = parser.get<int>("iterations");
 	if (!parser.check()) {
 		parser.printErrors();
 		parser.printMessage();
 		std::exit(EXIT_FAILURE);
 	}
 	// Validate the inputs before any GPU resource is created.
 	ASSERT_MSG(!I1.empty() && !I2.empty(), "imread failed.");
 	ASSERT_MSG(I1.size() == I2.size() && I1.type() == I2.type(), "input images must be same size and type.");
 	ASSERT_MSG(I1.type() == CV_8U || I1.type() == CV_16U, "input image format must be CV_8U or CV_16U.");
 	ASSERT_MSG(disp_size == 64 || disp_size == 128 || disp_size == 256, "disparity size must be 64, 128 or 256.");
 	ASSERT_MSG(num_paths == 4 || num_paths == 8, "number of scanlines must be 4 or 8.");
 	ASSERT_MSG(census_type == sgm::CensusType::CENSUS_9x7 || census_type == sgm::CensusType::SYMMETRIC_CENSUS_9x7, "census type must be 0 or 1.");
 	ASSERT_MSG(dst_depth == 8 || dst_depth == 16, "output depth bits must be 8 or 16");
 	if (subpixel)
 		ASSERT_MSG(dst_depth == 16, "output depth bits must be 16 if subpixel option is enabled.");
 	// The average below divides by `iterations`; zero or a negative count
 	// would print a meaningless (NaN or infinite) result.
 	ASSERT_MSG(iterations > 0, "number of iterations must be positive.");
 	const int width = I1.cols;
 	const int height = I1.rows;
 	const int src_depth = I1.type() == CV_8U ? 8 : 16;
 	// Buffer sizes in bytes, computed for tightly packed rows.
 	const int src_bytes = src_depth * width * height / 8;
 	const int dst_bytes = dst_depth * width * height / 8;
 	const sgm::PathType path_type = num_paths == 8 ? sgm::PathType::SCAN_8PATH : sgm::PathType::SCAN_4PATH;
 	const sgm::StereoSGM::Parameters param(10, 120, 0.95f, subpixel, path_type, 0, 1, census_type);
 	sgm::StereoSGM sgm(width, height, disp_size, src_depth, dst_depth, sgm::EXECUTE_INOUT_CUDA2CUDA, param);
 	device_buffer d_I1(src_bytes), d_I2(src_bytes), d_disparity(dst_bytes);
 	cv::Mat disparity(height, width, dst_depth == 8 ? CV_8S : CV_16S);
 	d_I1.upload(I1.data);
 	d_I2.upload(I2.data);
 	cudaDeviceProp prop;
 	int version;
 	cudaGetDeviceProperties(&prop, 0);
 	cudaRuntimeGetVersion(&version);
 	// show settings
 	std::cout << "# Settings" << std::endl;
 	std::cout << "device name         : " << prop.name << std::endl;
 	std::cout << "CUDA runtime version: " << version << std::endl;
 	std::cout << "image size          : " << I1.size() << std::endl;
 	std::cout << "disparity size      : " << disp_size << std::endl;
 	std::cout << "output depth        : " << dst_depth << std::endl;
 	std::cout << "subpixel option     : " << (subpixel ? "true" : "false") << std::endl;
 	std::cout << "sgm path            : " << num_paths << " path" << std::endl;
 	std::cout << "census type         : " << (census_type == sgm::CensusType::CENSUS_9x7 ? "CENSUS_9x7" : "SYMMETRIC_CENSUS_9x7") << std::endl;
 	std::cout << "iterations          : " << iterations << std::endl;
 	std::cout << std::endl;
 	// run benchmark
 	std::cout << "Running benchmark..." << std::endl;
 	uint64_t sum = 0;
 	// iterations + 1 runs: run 0 is executed but excluded from the sum.
 	for (int i = 0; i <= iterations; i++) {
 		const auto t1 = std::chrono::system_clock::now();
 		sgm.execute(d_I1.data, d_I2.data, d_disparity.data);
 		cudaDeviceSynchronize();
 		const auto t2 = std::chrono::system_clock::now();
 		if (i > 0)
 			sum += std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
 	}
 	std::cout << "Done." << std::endl << std::endl;
 	// show results (sum is in microseconds)
 	const double time_millisec = 1e-3 * sum / iterations;
 	const double fps = 1e3 / time_millisec;
 	std::cout << "# Results" << std::endl;
 	std::cout.setf(std::ios::fixed);
 	std::cout << std::setprecision(1) << "Processing Time[Milliseconds]: " << time_millisec << std::endl;
 	std::cout << std::setprecision(1) << "FPS                          : " << fps << std::endl;
 	std::cout << std::endl;
 	// save disparity image; with subpixel enabled the value range is scaled
 	// by SUBPIXEL_SCALE, so the colour mapping range is scaled accordingly
 	const int disp_scale = subpixel ? sgm::StereoSGM::SUBPIXEL_SCALE : 1;
 	d_disparity.download(disparity.data);
 	colorize_disparity(disparity, disparity, disp_scale * disp_size, disparity == sgm.get_invalid_disparity());
 	cv::imwrite("disparity.png", disparity);
 	return 0;
 }
@@ -0,0 +1,118 @@
 #include <iostream>
 #include <opencv2/core.hpp>
 #include <opencv2/imgproc.hpp>
 #include <opencv2/highgui.hpp>
 #include <opencv2/core/utils/filesystem.hpp>
 #include <libsgm.h>
 #include "sample_common.h"
 static const std::string keys =
 "{ @left-image-format  | <none> | format string for path to input left image  }"
 "{ @right-image-format | <none> | format string for path to input right image }"
 "{ disp_size           |    256 | maximum possible disparity value            }"
 "{ P1                  |     10 | penalty on the disparity change by plus or minus 1 }"
 "{ P2                  |    120 | penalty on the disparity change by more than 1 }"
 "{ uniqueness          |   0.80 | margin in ratio for best cost function value }"
 "{ num_paths           |      8 | number of scanlines used in cost aggregation }"
 "{ min_disp            |   -160 | minimum disparity value                     }"
 "{ LR_max_diff         |      1 | max allowed difference between L/R disparity }"
 "{ census_type         |      1 | type of census transform                    }"
 "{ interval            |      1 | polling interval in seconds                 }"
 "{ help h              |        | display this help and exit                  }";
 class ImagePreprocessor {
 public:
    void preprocess_image_pair(cv::Mat& img_left, cv::Mat& img_right) {
        if (img_left.channels() > 1) cv::cvtColor(img_left, img_left, cv::COLOR_BGR2GRAY);
        if (img_right.channels() > 1) cv::cvtColor(img_right, img_right, cv::COLOR_BGR2GRAY);
        int min_height = std::min(img_left.rows, img_right.rows);
        int min_width = std::min(img_left.cols, img_right.cols);
        img_left = img_left(cv::Rect(0, 0, min_width, min_height));
        img_right = img_right(cv::Rect(0, 0, min_width, min_height));
    }
 };
 // Returns true when the result file output/disparity_<frame_no>.xml
 // (frame number zero-padded to four digits) already exists.
 bool disparityAlreadyProcessed(int frame_no) {
    return cv::utils::fs::exists(cv::format("output/disparity_%04d.xml", frame_no));
 }
 /*
  * Entry point: walks through frame numbers 0, 1, 2, ... and, for every frame
  * whose left and right image files exist and whose result file is not yet
  * present, computes the disparity on the host-to-host path of sgm::StereoSGM
  * and stores it as output/disparity_<frame>.xml.
  *
  * The loop has no regular end. It is only left when an existing image file
  * cannot be decoded, in which case the program still returns 0.
  */
 int main(int argc, char* argv[]) {
    cv::CommandLineParser parser(argc, argv, keys);
    if (parser.has("help")) {
        parser.printMessage();
        return 0;
    }
    const std::string format_L = parser.get<cv::String>("@left-image-format");
    const std::string format_R = parser.get<cv::String>("@right-image-format");
    const int disp_size = parser.get<int>("disp_size");
    const int P1 = parser.get<int>("P1");
    const int P2 = parser.get<int>("P2");
    const float uniqueness = parser.get<float>("uniqueness");
    const int num_paths = parser.get<int>("num_paths");
    const int min_disp = parser.get<int>("min_disp");
    const int LR_max_diff = parser.get<int>("LR_max_diff");
    const int interval = parser.get<int>("interval");
    const auto census_type = static_cast<sgm::CensusType>(parser.get<int>("census_type"));
    // Reject missing or unparsable command-line arguments.
    if (!parser.check()) {
        parser.printErrors();
        parser.printMessage();
        std::exit(EXIT_FAILURE);
    }
    // Results go to ./output relative to the current working directory.
    if (!cv::utils::fs::exists("output")) {
        cv::utils::fs::createDirectory("output");
    }
    ImagePreprocessor preprocessor;
    // Any num_paths value other than 8 selects the 4-path scan.
    const sgm::PathType path_type = num_paths == 8 ? sgm::PathType::SCAN_8PATH : sgm::PathType::SCAN_4PATH;
    // The literal `false` is passed where the other samples pass their subpixel flag.
    const sgm::StereoSGM::Parameters param(P1, P2, uniqueness, false, path_type, min_disp, LR_max_diff, census_type);
    int last_checked = 0;
    while (true) {
        const std::string left_path = cv::format(format_L.c_str(), last_checked);
        const std::string right_path = cv::format(format_R.c_str(), last_checked);
        if (cv::utils::fs::exists(left_path) && cv::utils::fs::exists(right_path) && !disparityAlreadyProcessed(last_checked)) {
            cv::TickMeter timer;
            timer.start();
            std::cout << "Processing frame " << last_checked;
            cv::Mat I1 = cv::imread(left_path, cv::IMREAD_UNCHANGED);
            cv::Mat I2 = cv::imread(right_path, cv::IMREAD_UNCHANGED);
            if (I1.empty() || I2.empty()) {
                std::cerr << "Error reading images." << std::endl;
                break;
            }
            preprocessor.preprocess_image_pair(I1, I2);
            ASSERT_MSG(I1.size() == I2.size() && I1.type() == I2.type(), "Mismatched image size/type.");
            ASSERT_MSG(I1.type() == CV_8U || I1.type() == CV_16U, "Images must be CV_8U or CV_16U.");
            const int src_depth = I1.type() == CV_8U ? 8 : 16;
            // The output is always 16 bit, matching the CV_16S result matrix.
            const int dst_depth = 16;
            // A new matcher is created for every processed frame.
            sgm::StereoSGM ssgm(I1.cols, I1.rows, disp_size, src_depth, dst_depth, sgm::EXECUTE_INOUT_HOST2HOST, param);
            cv::Mat disparity(I1.size(), CV_16S);
            ssgm.execute(I1.data, I2.data, disparity.data);
            // Store the raw disparity matrix under the node name "disparity".
            cv::FileStorage fs(cv::format("output/disparity_%04d.xml", last_checked), cv::FileStorage::WRITE);
            fs << "disparity" << disparity;
            fs.release();
            timer.stop();
            std::cout << " - " << timer.getTimeSec() << " seconds" << std::endl;
        }
        // NOTE(review): the frame number advances on every pass, also when the
        // pair was not found, so a pair that appears after its number has been
        // visited is never processed. Confirm that this is intended.
        last_checked++;
        // Pause between checks.
        // NOTE(review): no HighGUI window is created in this function; confirm
        // that cv::waitKey really delays for the polling interval here.
        cv::waitKey(interval * 1000);
    }
    return 0;
 }
@@ -0,0 +1,120 @@
 /*
 Copyright 2016 Fixstars Corporation
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
 #include <iostream>
 #include <opencv2/core.hpp>
 #include <opencv2/imgproc.hpp>
 #include <opencv2/highgui.hpp>
 #include <libsgm.h>
 #include "sample_common.h"
 // Option table for cv::CommandLineParser. Each entry has the form
 // "{ name | default value | help text }"; names starting with '@' are positional arguments.
 static const std::string keys =
 "{ @left_img   | <none> | path to input left image                                                            }"
 "{ @right_img  | <none> | path to input right image                                                           }"
 "{ disp_size   |     64 | maximum possible disparity value                                                    }"
 "{ P1          |     10 | penalty on the disparity change by plus or minus 1 between neighbor pixels          }"
 "{ P2          |    120 | penalty on the disparity change by more than 1 between neighbor pixels              }"
 "{ uniqueness  |   0.95 | margin in ratio by which the best cost function value should be at least second one }"
 "{ num_paths   |      8 | number of scanlines used in cost aggregation                                        }"
 "{ min_disp    |      0 | minimum disparity value                                                             }"
 "{ LR_max_diff |      1 | maximum allowed difference between left and right disparity                         }"
 "{ census_type |      1 | type of census transform (0:CENSUS_9x7 1:SYMMETRIC_CENSUS_9x7)                      }"
 "{ help h      |        | display this help and exit                                                          }";
 int main(int argc, char* argv[])
 {
 	cv::CommandLineParser parser(argc, argv, keys);
 	if (parser.has("help")) {
 		parser.printMessage();
 		return 0;
 	}
 	cv::Mat I1 = cv::imread(parser.get<cv::String>("@left_img"), cv::IMREAD_UNCHANGED);
 	cv::Mat I2 = cv::imread(parser.get<cv::String>("@right_img"), cv::IMREAD_UNCHANGED);
 	const int disp_size = parser.get<int>("disp_size");
 	const int P1 = parser.get<int>("P1");
 	const int P2 = parser.get<int>("P2");
 	const float uniqueness = parser.get<float>("uniqueness");
 	const int num_paths = parser.get<int>("num_paths");
 	const int min_disp = parser.get<int>("min_disp");
 	const int LR_max_diff = parser.get<int>("LR_max_diff");
 	const auto census_type = static_cast<sgm::CensusType>(parser.get<int>("census_type"));
 	if (!parser.check()) {
 		parser.printErrors();
 		parser.printMessage();
 		std::exit(EXIT_FAILURE);
 	}
 	ASSERT_MSG(!I1.empty() && !I2.empty(), "imread failed.");
 	ASSERT_MSG(I1.size() == I2.size() && I1.type() == I2.type(), "input images must be same size and type.");
 	ASSERT_MSG(I1.type() == CV_8U || I1.type() == CV_16U, "input image format must be CV_8U or CV_16U.");
 	ASSERT_MSG(disp_size == 64 || disp_size == 128 || disp_size == 256, "disparity size must be 64, 128 or 256.");
 	ASSERT_MSG(num_paths == 4 || num_paths == 8, "number of scanlines must be 4 or 8.");
 	ASSERT_MSG(census_type == sgm::CensusType::CENSUS_9x7 || census_type == sgm::CensusType::SYMMETRIC_CENSUS_9x7, "census type must be 0 or 1.");
 	const sgm::PathType path_type = num_paths == 8 ? sgm::PathType::SCAN_8PATH : sgm::PathType::SCAN_4PATH;
 	sgm::LibSGMWrapper sgm(disp_size, P1, P2, uniqueness, false, path_type, min_disp, LR_max_diff, census_type);
 	cv::Mat disparity;
 	try {
 		cv::cuda::GpuMat d_I1(I1), d_I2(I2), d_disparity;
 		sgm.execute(d_I1, d_I2, d_disparity);
 		d_disparity.download(disparity);
 	}
 	catch (const cv::Exception& e) {
 		std::cerr << e.what() << std::endl;
 		return e.code == cv::Error::GpuNotSupported ? 1 : -1;
 	}
 	// create mask for invalid disp
 	const cv::Mat mask = disparity == sgm.getInvalidDisparity();
 	// show image
 	cv::Mat disparity_8u, disparity_color;
 	disparity.convertTo(disparity_8u, CV_8U, 255. / disp_size);
 	cv::applyColorMap(disparity_8u, disparity_color, cv::COLORMAP_TURBO);
 	disparity_8u.setTo(0, mask);
 	disparity_color.setTo(cv::Scalar::all(0), mask);
 	if (I1.type() != CV_8U)
 		cv::normalize(I1, I1, 0, 255, cv::NORM_MINMAX, CV_8U);
 	const std::vector<cv::Mat> images = { disparity_8u, disparity_color, I1 };
 	const std::vector<std::string> titles = { "disparity", "disparity color", "input" };
 	std::cout << "Hot keys:" << std::endl;
 	std::cout << "\tESC - quit the program" << std::endl;
 	std::cout << "\ts - switch display (disparity | colored disparity | input image)" << std::endl;
 	int mode = 0;
 	while (true) {
 		cv::setWindowTitle("image", titles[mode]);
 		cv::imshow("image", images[mode]);
 		const char c = cv::waitKey(0);
 		if (c == 's')
 			mode = (mode < 2 ? mode + 1 : 0);
 		if (c == 27)
 			break;
 	}
 	return 0;
 }
@@ -0,0 +1,121 @@
 /*
 Copyright 2016 Fixstars Corporation
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
 #include <iostream>
 #include <chrono>
 #include <opencv2/core.hpp>
 #include <opencv2/imgproc.hpp>
 #include <opencv2/highgui.hpp>
 #include <libsgm.h>
 #include "sample_common.h"
 // Option table for cv::CommandLineParser. Each entry has the form
 // "{ name | default value | help text }". The two '@' entries are format strings that main()
 // expands with the frame number through cv::format.
 static const std::string keys =
 "{ @left-image-format  | <none> | format string for path to input left image  }"
 "{ @right-image-format | <none> | format string for path to input right image }"
 "{ disp_size           |    128 | maximum possible disparity value            }"
 "{ start_number        |      0 | index to start reading                      }"
 "{ help h              |        | display this help and exit                  }";
 int main(int argc, char* argv[])
 {
 	cv::CommandLineParser parser(argc, argv, keys);
 	if (parser.has("help")) {
 		parser.printMessage();
 		return 0;
 	}
 	const std::string image_format_L = parser.get<cv::String>("@left-image-format");
 	const std::string image_format_R = parser.get<cv::String>("@right-image-format");
 	const int disp_size = parser.get<int>("disp_size");
 	const int start_number = parser.get<int>("start_number");
 	if (!parser.check()) {
 		parser.printErrors();
 		parser.printMessage();
 		std::exit(EXIT_FAILURE);
 	}
 	cv::Mat I1 = cv::imread(cv::format(image_format_L.c_str(), start_number), cv::IMREAD_UNCHANGED);
 	cv::Mat I2 = cv::imread(cv::format(image_format_R.c_str(), start_number), cv::IMREAD_UNCHANGED);
 	ASSERT_MSG(!I1.empty() && !I2.empty(), "imread failed.");
 	if (I1.channels() > 1) cv::cvtColor(I1, I1, cv::COLOR_BGR2GRAY);
 	if (I2.channels() > 1) cv::cvtColor(I2, I2, cv::COLOR_BGR2GRAY);
 	ASSERT_MSG(I1.size() == I2.size() && I1.type() == I2.type(), "input images must be same size and type.");
 	ASSERT_MSG(I1.type() == CV_8U || I1.type() == CV_16U, "input image format must be CV_8U or CV_16U.");
 	ASSERT_MSG(disp_size == 64 || disp_size == 128 || disp_size == 256, "disparity size must be 64, 128 or 256.");
 	const int width = I1.cols;
 	const int height = I1.rows;
 	const int src_depth = I1.type() == CV_8U ? 8 : 16;
 	const int dst_depth = disp_size < 256 ? 8 : 16;
 	const int src_bytes = src_depth * width * height / 8;
 	const int dst_bytes = dst_depth * width * height / 8;
 	sgm::StereoSGM sgm(width, height, disp_size, src_depth, dst_depth, sgm::EXECUTE_INOUT_CUDA2CUDA);
 	device_buffer d_I1(src_bytes), d_I2(src_bytes), d_disparity(dst_bytes);
 	cv::Mat disparity(height, width, dst_depth == 8 ? CV_8S : CV_16S), disparity_color;
 	const int invalid_disp = sgm.get_invalid_disparity();
 	for (int frame_no = start_number;; frame_no++) {
 		I1 = cv::imread(cv::format(image_format_L.c_str(), frame_no), cv::IMREAD_UNCHANGED);
 		I2 = cv::imread(cv::format(image_format_R.c_str(), frame_no), cv::IMREAD_UNCHANGED);
 		if (I1.empty() || I2.empty()) {
 			frame_no = start_number - 1;
 			continue;
 		}
 		if (I1.channels() > 1) cv::cvtColor(I1, I1, cv::COLOR_BGR2GRAY);
 		if (I2.channels() > 1) cv::cvtColor(I2, I2, cv::COLOR_BGR2GRAY);
 		d_I1.upload(I1.data);
 		d_I2.upload(I2.data);
 		const auto t1 = std::chrono::system_clock::now();
 		sgm.execute(d_I1.data, d_I2.data, d_disparity.data);
 		cudaDeviceSynchronize();
 		const auto t2 = std::chrono::system_clock::now();
 		const auto duration = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
 		const double fps = 1e6 / duration;
 		d_disparity.download(disparity.data);
 		// draw results
 		if (I1.type() != CV_8U)
 			cv::normalize(I1, I1, 0, 255, cv::NORM_MINMAX, CV_8U);
 		colorize_disparity(disparity, disparity_color, disp_size, disparity == invalid_disp);
 		cv::putText(disparity_color, cv::format("sgm execution time: %4.1f[msec] %4.1f[FPS]",
 			1e-3 * duration, fps), cv::Point(50, 50), 2, 0.75, cv::Scalar(255, 255, 255));
 		cv::imshow("left image", I1);
 		cv::imshow("disparity", disparity_color);
 		const char c = cv::waitKey(1);
 		if (c == 27) // ESC
 			break;
 	}
 	return 0;
 }
@@ -0,0 +1,124 @@
 #include <iostream>
 #include <opencv2/core.hpp>
 #include <opencv2/imgproc.hpp>
 #include <opencv2/highgui.hpp>
 #include <libsgm.h>
 #include "sample_common.h"
 // Option table for cv::CommandLineParser. Each entry has the form
 // "{ name | default value | help text }"; names starting with '@' are positional arguments.
 // output_dir and no_display are the options main() uses for saving results and for skipping
 // the interactive window.
 static const std::string keys =
 "{ @left_img   | <none> | path to input left image                                                            }"
 "{ @right_img  | <none> | path to input right image                                                           }"
 "{ disp_size   |    256 | maximum possible disparity value                                                    }"
 "{ P1          |     10 | penalty on the disparity change by plus or minus 1 between neighbor pixels          }"
 "{ P2          |    120 | penalty on the disparity change by more than 1 between neighbor pixels              }"
 "{ uniqueness  |   0.80 | margin in ratio by which the best cost function value should be at least second one }"
 "{ num_paths   |      8 | number of scanlines used in cost aggregation                                        }"
 "{ min_disp    |   -160 | minimum disparity value                                                             }"
 "{ LR_max_diff |      1 | maximum allowed difference between left and right disparity                         }"
 "{ census_type |      1 | type of census transform (0:CENSUS_9x7 1:SYMMETRIC_CENSUS_9x7)                      }"
 "{ output_dir  |      . | directory to save disparity.xml and disparity_color.png                             }"
 "{ no_display  |      0 | set to 1 to skip interactive display window (for pipeline/headless use)             }"
 "{ help h      |        | display this help and exit                                                          }";
 int main(int argc, char* argv[])
 {
    double start_time = cv::getTickCount();  // Start total execution time
    cv::CommandLineParser parser(argc, argv, keys);
    if (parser.has("help")) {
        parser.printMessage();
        return 0;
    }
    double load_start = cv::getTickCount(); // Start loading time
    cv::Mat I1 = cv::imread(parser.get<cv::String>("@left_img"), cv::IMREAD_UNCHANGED);
    cv::Mat I2 = cv::imread(parser.get<cv::String>("@right_img"), cv::IMREAD_UNCHANGED);
    double load_end = cv::getTickCount();
    double load_time_s = (load_end - load_start) / cv::getTickFrequency();  // Seconds
    double load_time_ms = load_time_s * 1000.0;  // Milliseconds
    std::cout << "Image Loading Time: " << load_time_s << " s (" << load_time_ms << " ms)" << std::endl;
    if (I1.channels() > 1) cv::cvtColor(I1, I1, cv::COLOR_BGR2GRAY);
    if (I2.channels() > 1) cv::cvtColor(I2, I2, cv::COLOR_BGR2GRAY);
    const int disp_size = parser.get<int>("disp_size");
    const int P1 = parser.get<int>("P1");
    const int P2 = parser.get<int>("P2");
    const float uniqueness = parser.get<float>("uniqueness");
    const int num_paths = parser.get<int>("num_paths");
    const int min_disp = parser.get<int>("min_disp");
    const int LR_max_diff = parser.get<int>("LR_max_diff");
    const auto census_type = static_cast<sgm::CensusType>(parser.get<int>("census_type"));
    if (!parser.check()) {
        parser.printErrors();
        parser.printMessage();
        std::exit(EXIT_FAILURE);
    }
    ASSERT_MSG(!I1.empty() && !I2.empty(), "imread failed.");
    ASSERT_MSG(I1.size() == I2.size() && I1.type() == I2.type(), "input images must be same size and type.");
    ASSERT_MSG(I1.type() == CV_8U || I1.type() == CV_16U, "input image format must be CV_8U or CV_16U.");
    ASSERT_MSG(disp_size == 64 || disp_size == 128 || disp_size == 256, "disparity size must be 64, 128 or 256.");
    ASSERT_MSG(num_paths == 4 || num_paths == 8, "number of scanlines must be 4 or 8.");
    ASSERT_MSG(census_type == sgm::CensusType::CENSUS_9x7 || census_type == sgm::CensusType::SYMMETRIC_CENSUS_9x7, "census type must be 0 or 1.");
    const int src_depth = I1.type() == CV_8U ? 8 : 16;
    const int dst_depth = 16;
    const sgm::PathType path_type = num_paths == 8 ? sgm::PathType::SCAN_8PATH : sgm::PathType::SCAN_4PATH;
    const sgm::StereoSGM::Parameters param(P1, P2, uniqueness, false, path_type, min_disp, LR_max_diff, census_type);
    sgm::StereoSGM ssgm(I1.cols, I1.rows, disp_size, src_depth, dst_depth, sgm::EXECUTE_INOUT_HOST2HOST, param);
    cv::Mat disparity(I1.size(), CV_16S);
    double disparity_start = cv::getTickCount(); // Start disparity computation time
    ssgm.execute(I1.data, I2.data, disparity.data);
    double disparity_end = cv::getTickCount();
    double disparity_time_s = (disparity_end - disparity_start) / cv::getTickFrequency();  // Seconds
    double disparity_time_ms = disparity_time_s * 1000.0;  // Milliseconds
    std::cout << "Disparity Computation Time: " << disparity_time_s << " s (" << disparity_time_ms << " ms)" << std::endl;
    const std::string output_dir = parser.get<std::string>("output_dir");
    // Save disparity
    cv::FileStorage fs(output_dir + "/disparity.xml", cv::FileStorage::WRITE);
    fs << "disparity" << disparity;
    fs.release();
    // Convert disparity to 8-bit for visualization
    cv::Mat disparity_8u, disparity_color;
    disparity.convertTo(disparity_8u, CV_8U, 255.0 / disp_size);
    cv::applyColorMap(disparity_8u, disparity_color, cv::COLORMAP_TURBO);
    // Save colored disparity image
    cv::imwrite(output_dir + "/disparity_color.png", disparity_color);
    double total_end = cv::getTickCount();
    double total_time_s = (total_end - start_time) / cv::getTickFrequency();  // Seconds
    double total_time_ms = total_time_s * 1000.0;  // Milliseconds
    std::cout << "Total Execution Time: " << total_time_s << " s (" << total_time_ms << " ms)" << std::endl;
    // Display images
    const std::vector<cv::Mat> images = { disparity_8u, disparity_color, I1 };
    const std::vector<std::string> titles = { "Disparity", "Colored Disparity", "Input Image" };
    if (!parser.get<int>("no_display")) {
        std::cout << "Hot keys:\n";
        std::cout << "\tESC - Quit the program\n";
        std::cout << "\ts - Switch display (Disparity | Colored Disparity | Input Image)\n";
        int mode = 0;
        while (true) {
            cv::setWindowTitle("Image", titles[mode]);
            cv::imshow("Image", images[mode]);
            const char c = cv::waitKey(0);
            if (c == 's') mode = (mode < 2 ? mode + 1 : 0);
            if (c == 27) break;
        }
    }
    return 0;
 }
@@ -0,0 +1,120 @@
 /*
 Copyright 2016 Fixstars Corporation
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
 #include <algorithm>
 #include <cstdint>
 #include <fstream>
 #include <iostream>
 #include <opencv2/core.hpp>
 #include <opencv2/imgproc.hpp>
 #include <opencv2/highgui.hpp>
 #include <libsgm.h>
 #include "sample_common.h"
 // Option table for cv::CommandLineParser. Each entry has the form
 // "{ name | default value | help text }"; names starting with '@' are positional arguments.
 static const std::string keys =
 "{ @left_img   | <none> | path to input left image                                                            }"
 "{ @right_img  | <none> | path to input right image                                                           }"
 "{ disp_size   |     64 | maximum possible disparity value                                                    }"
 "{ P1          |     10 | penalty on the disparity change by plus or minus 1 between neighbor pixels          }"
 "{ P2          |    120 | penalty on the disparity change by more than 1 between neighbor pixels              }"
 "{ uniqueness  |   0.95 | margin in ratio by which the best cost function value should be at least second one }"
 "{ num_paths   |      8 | number of scanlines used in cost aggregation                                        }"
 "{ min_disp    |      0 | minimum disparity value                                                             }"
 "{ LR_max_diff |      1 | maximum allowed difference between left and right disparity                         }"
 "{ census_type |      1 | type of census transform (0:CENSUS_9x7 1:SYMMETRIC_CENSUS_9x7)                      }"
 "{ help h      |        | display this help and exit                                                          }";
 int main(int argc, char* argv[]) {
    cv::CommandLineParser parser(argc, argv, keys);
    if (parser.has("help")) {
        parser.printMessage();
        return 0;
    }
    cv::Mat I1 = cv::imread(parser.get<cv::String>("@left_img"), cv::IMREAD_UNCHANGED);
    cv::Mat I2 = cv::imread(parser.get<cv::String>("@right_img"), cv::IMREAD_UNCHANGED);
    // Preprocessing: Convert images to grayscale if necessary
    if (I1.channels() > 1) cv::cvtColor(I1, I1, cv::COLOR_BGR2GRAY);
    if (I2.channels() > 1) cv::cvtColor(I2, I2, cv::COLOR_BGR2GRAY);
    // Ensure images have the same size by cropping
    int new_width = std::min(I1.cols, I2.cols);
    int new_height = std::min(I1.rows, I2.rows);
    I1 = I1(cv::Rect(0, 0, new_width, new_height));
    I2 = I2(cv::Rect(0, 0, new_width, new_height));
    const int disp_size = parser.get<int>("disp_size");
    const int P1 = parser.get<int>("P1");
    const int P2 = parser.get<int>("P2");
    const float uniqueness = parser.get<float>("uniqueness");
    const int num_paths = parser.get<int>("num_paths");
    const int min_disp = parser.get<int>("min_disp");
    const int LR_max_diff = parser.get<int>("LR_max_diff");
    const auto census_type = static_cast<sgm::CensusType>(parser.get<int>("census_type"));
    if (!parser.check()) {
        parser.printErrors();
        parser.printMessage();
        std::exit(EXIT_FAILURE);
    }
    ASSERT_MSG(!I1.empty() && !I2.empty(), "imread failed.");
    ASSERT_MSG(I1.size() == I2.size(), "input images must be the same size.");
    ASSERT_MSG(I1.type() == CV_8U, "input image format must be CV_8U.");
    ASSERT_MSG(disp_size == 64 || disp_size == 128 || disp_size == 256, "disparity size must be 64, 128 or 256.");
    ASSERT_MSG(num_paths == 4 || num_paths == 8, "number of scanlines must be 4 or 8.");
    const sgm::StereoSGM::Parameters param(P1, P2, uniqueness, false, sgm::PathType::SCAN_8PATH, min_disp, LR_max_diff, census_type);
    sgm::StereoSGM ssgm(I1.cols, I1.rows, disp_size, 8, 16, sgm::EXECUTE_INOUT_HOST2HOST, param);
    cv::Mat disparity(I1.size(), CV_16S);
    ssgm.execute(I1.data, I2.data, disparity.data);
    // Convert disparity to 8-bit and apply colormap
    cv::Mat disparity_8u, disparity_color;
    disparity.convertTo(disparity_8u, CV_8U, 255. / disp_size);
    cv::applyColorMap(disparity_8u, disparity_color, cv::COLORMAP_TURBO);
    // Save disparity map
    cv::imwrite("disparity_map.png", disparity_8u);
    // Optionally save disparity values as a text file
    std::ofstream file("disparity_values.txt");
    if (file.is_open()) {
        for (int i = 0; i < disparity.rows; ++i) {
            for (int j = 0; j < disparity.cols; ++j) {
                file << static_cast<int>(disparity.at<int16_t>(i, j)) << " ";
            }
            file << "\n";
        }
        file.close();
    }
    std::cout << "Hot keys:\n\tESC - quit the program\n\ts - switch display (disparity | colored disparity | input image)\n";
    const std::vector<cv::Mat> images = { disparity_8u, disparity_color, I1 };
    const std::vector<std::string> titles = { "disparity", "disparity color", "input" };
    int mode = 0;
    while (true) {
        cv::setWindowTitle("image", titles[mode]);
        cv::imshow("image", images[mode]);
        char c = cv::waitKey(0);
        if (c == 's') mode = (mode + 1) % 3;
        if (c == 27) break;
    }
    return 0;
 }
@@ -0,0 +1,253 @@
 /*
 Copyright 2016 Fixstars Corporation
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
 #include <iostream>
 #include <chrono>
 #include <opencv2/core.hpp>
 #include <opencv2/imgproc.hpp>
 #include <opencv2/highgui.hpp>
 #include <libsgm.h>
 #include "sample_common.h"
 // Camera Parameters
 // Plain aggregate describing the stereo rig. Every member defaults to zero so that a field the
 // caller never assigns (main() below does not set `height`) holds a defined value when the
 // struct is copied.
 struct CameraParameters
 {
 	float fu = 0.f;           //!< focal length x (pixel)
 	float fv = 0.f;           //!< focal length y (pixel)
 	float u0 = 0.f;           //!< principal point x (pixel)
 	float v0 = 0.f;           //!< principal point y (pixel)
 	float baseline = 0.f;     //!< baseline (meter)
 	float height = 0.f;       //!< height position (meter), ignored when ROAD_ESTIMATION_AUTO
 	float tilt = 0.f;         //!< tilt angle (radian), ignored when ROAD_ESTIMATION_AUTO
 };
 // Maps a pixel position plus its disparity to a 3D point, applying the camera tilt as a
 // rotation between the Y and Z axes.
 struct CoordinateTransform
 {
 	// Caches the tilt sine/cosine, baseline * fu and the reciprocal focal lengths of `camera`.
 	CoordinateTransform(const CameraParameters& camera)
 		: camera(camera)
 		, sinTilt(sinf(camera.tilt))
 		, cosTilt(cosf(camera.tilt))
 		, bf(camera.baseline * camera.fu)
 		, invfu(1.f / camera.fu)
 		, invfv(1.f / camera.fv)
 	{
 	}

 	// Returns the world coordinates of pixel `pt` observed with disparity `d`.
 	// The depth is bf / d, so callers are expected to pass a non-zero disparity.
 	inline cv::Point3f imageToWorld(const cv::Point2f& pt, float d) const
 	{
 		// camera frame
 		const float depth = bf / d;
 		const float camX = invfu * (pt.x - camera.u0) * depth;
 		const float camY = invfv * (pt.y - camera.v0) * depth;

 		// world frame: X is unchanged, Y and Z are rotated by the tilt angle
 		return cv::Point3f(camX, camY * cosTilt + depth * sinTilt, depth * cosTilt - camY * sinTilt);
 	}

 	CameraParameters camera;
 	float sinTilt, cosTilt, bf, invfu, invfv;
 };
 // Converts every pixel of `disparity` (CV_32F) whose value is greater than zero into a 3D
 // point using `camera`. `points` is cleared first and then filled in row-major pixel order.
 // `subpixeled` is accepted for interface compatibility but not used by this function.
 void reprojectPointsTo3D(const cv::Mat& disparity, const CameraParameters& camera, std::vector<cv::Point3f>& points, bool subpixeled)
 {
 	CV_Assert(disparity.type() == CV_32F);

 	const CoordinateTransform transform(camera);

 	points.clear();
 	points.reserve(disparity.rows * disparity.cols);

 	for (int row = 0; row < disparity.rows; ++row)
 	{
 		const float* line = disparity.ptr<float>(row);
 		for (int col = 0; col < disparity.cols; ++col)
 		{
 			const float value = line[col];
 			// Skip everything that is not strictly positive (this also rejects NaN).
 			if (!(value > 0))
 				continue;
 			points.push_back(transform.imageToWorld(cv::Point(col, row), value));
 		}
 	}
 }
 // Maps `val` to a fully saturated, full-brightness color through an HSV-to-BGR conversion.
 // The hue is 0.6 * (1 - val) in units of a full turn, wrapped into six sectors.
 static cv::Vec3b computeColor(float val)
 {
 	// For each hue sector: which entry of `levels` feeds the B, G and R channel.
 	static const int sector_data[][3] =
 	{ { 1,3,0 },{ 1,0,2 },{ 3,0,1 },{ 0,2,1 },{ 0,1,3 },{ 2,1,0 } };

 	const float sectorCount = 6.f;
 	const float saturation = 1.f;
 	const float brightness = 1.f;

 	float hue = 0.6f * (1.f - val);
 	hue *= sectorCount;

 	// Wrap the scaled hue into [0, 6).
 	if (hue < 0)
 	{
 		while (hue < 0)
 			hue += 6;
 	}
 	else if (hue >= 6)
 	{
 		while (hue >= 6)
 			hue -= 6;
 	}

 	int sector = cvFloor(hue);
 	hue -= sector;
 	// Guard against a sector outside the table (e.g. after rounding in the wrap above).
 	if ((unsigned)sector >= 6u)
 	{
 		sector = 0;
 		hue = 0.f;
 	}

 	float levels[4];
 	levels[0] = brightness;
 	levels[1] = brightness * (1.f - saturation);
 	levels[2] = brightness * (1.f - saturation * hue);
 	levels[3] = brightness * (1.f - saturation * (1.f - hue));

 	const int* pick = sector_data[sector];
 	const uchar blue = (uchar)(255 * levels[pick[0]]);
 	const uchar green = (uchar)(255 * levels[pick[1]]);
 	const uchar red = (uchar)(255 * levels[pick[2]]);

 	return cv::Vec3b(blue, green, red);
 }
 // Renders `points` as a top-down view into `draw`, a freshly zeroed 512 x 1024 CV_8UC3 image.
 // X runs horizontally around the image center, Z runs from the bottom edge upwards, and the
 // image height corresponds to `maxz` = 20 meters. Each point is drawn as a radius-1 circle
 // whose color encodes its depth through a 256-entry table that is built on the first call.
 void drawPoints3D(const std::vector<cv::Point3f>& points, cv::Mat& draw)
 {
 	const int SIZE_X = 512;
 	const int SIZE_Z = 1024;
 	const int maxz = 20; // [meter]

 	const double pixelsPerMeter = 1. * SIZE_Z / maxz;

 	draw = cv::Mat::zeros(SIZE_Z, SIZE_X, CV_8UC3);

 	const int tableSize = 256;
 	const float scaleZ = 1.f * (tableSize - 1) / maxz;

 	static std::vector<cv::Vec3b> colorTable;
 	if (colorTable.empty())
 	{
 		colorTable.resize(tableSize);
 		for (int i = 0; i < tableSize; i++)
 			colorTable[i] = computeColor(1.f * i / tableSize);
 	}

 	for (const cv::Point3f& pt : points)
 	{
 		const float X = pt.x;
 		const float Z = pt.z;

 		const int u = cvRound(pixelsPerMeter * X) + SIZE_X / 2;
 		const int v = SIZE_Z - cvRound(pixelsPerMeter * Z);

 		// Clamp the depth to [0, maxz] before it is used as a table index: a negative Z (possible
 		// once the tilt rotation is applied) would otherwise index in front of the table.
 		// std::max(0.f, Z) also yields 0 for NaN because the comparison 0 < NaN is false.
 		const float clampedZ = std::min(std::max(0.f, Z), 1.f * maxz);
 		const auto& color = colorTable[cvRound(scaleZ * clampedZ)];
 		cv::circle(draw, cv::Point(u, v), 1, color);
 	}
 }
 int main(int argc, char* argv[])
 {
 	if (argc < 4) {
 		std::cout << "usage: " << argv[0] << " left-image-format right-image-format camera.xml [disp_size] [subpixel_enable(0: false, 1:true)]" << std::endl;
 		std::exit(EXIT_FAILURE);
 	}
 	const int start_number = 1;
 	cv::Mat I1 = cv::imread(cv::format(argv[1], start_number), cv::IMREAD_UNCHANGED);
 	cv::Mat I2 = cv::imread(cv::format(argv[2], start_number), cv::IMREAD_UNCHANGED);
 	const cv::FileStorage fs(argv[3], cv::FileStorage::READ);
 	const int disp_size = argc >= 5 ? std::stoi(argv[4]) : 128;
 	const bool subpixel = argc >= 6 ? std::stoi(argv[5]) != 0 : true;
 	ASSERT_MSG(!I1.empty() && !I2.empty(), "imread failed.");
 	ASSERT_MSG(fs.isOpened(), "camera.xml read failed.");
 	ASSERT_MSG(I1.size() == I2.size() && I1.type() == I2.type(), "input images must be same size and type.");
 	ASSERT_MSG(I1.type() == CV_8U || I1.type() == CV_16U, "input image format must be CV_8U or CV_16U.");
 	ASSERT_MSG(disp_size == 64 || disp_size == 128 || disp_size == 256, "disparity size must be 64, 128 or 256.");
 	// read camera parameters
 	CameraParameters camera;
 	camera.fu = fs["FocalLengthX"];
 	camera.fv = fs["FocalLengthY"];
 	camera.u0 = fs["CenterX"];
 	camera.v0 = fs["CenterY"];
 	camera.baseline = fs["BaseLine"];
 	camera.tilt = fs["Tilt"];
 	const int width = I1.cols;
 	const int height = I1.rows;
 	const int src_depth = I1.type() == CV_8U ? 8 : 16;
 	const int dst_depth = 16;
 	const int src_bytes = src_depth * width * height / 8;
 	const int dst_bytes = dst_depth * width * height / 8;
 	const sgm::StereoSGM::Parameters param(10, 120, 0.95f, subpixel);
 	sgm::StereoSGM sgm(width, height, disp_size, src_depth, dst_depth, sgm::EXECUTE_INOUT_CUDA2CUDA, param);
 	device_buffer d_I1(src_bytes), d_I2(src_bytes), d_disparity(dst_bytes);
 	cv::Mat disparity(height, width, dst_depth == 8 ? CV_8S : CV_16S), disparity_color, disparity_32f, draw;
 	std::vector<cv::Point3f> points;
 	const int invalid_disp = sgm.get_invalid_disparity();
 	const int disp_scale = subpixel ? sgm::StereoSGM::SUBPIXEL_SCALE : 1;
 	for (int frame_no = start_number;; frame_no++) {
 		I1 = cv::imread(cv::format(argv[1], frame_no), cv::IMREAD_UNCHANGED);
 		I2 = cv::imread(cv::format(argv[2], frame_no), cv::IMREAD_UNCHANGED);
 		if (I1.empty() || I2.empty()) {
 			frame_no = start_number - 1;
 			continue;
 		}
 		d_I1.upload(I1.data);
 		d_I2.upload(I2.data);
 		const auto t1 = std::chrono::system_clock::now();
 		sgm.execute(d_I1.data, d_I2.data, d_disparity.data);
 		cudaDeviceSynchronize();
 		const auto t2 = std::chrono::system_clock::now();
 		const auto duration = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
 		const double fps = 1e6 / duration;
 		d_disparity.download(disparity.data);
 		// reproject points
 		disparity.convertTo(disparity_32f, CV_32F, 1. / disp_scale);
 		reprojectPointsTo3D(disparity_32f, camera, points, subpixel);
 		// draw results
 		if (I1.type() != CV_8U)
 			cv::normalize(I1, I1, 0, 255, cv::NORM_MINMAX, CV_8U);
 		colorize_disparity(disparity, disparity_color, disp_scale * disp_size, disparity == invalid_disp);
 		cv::putText(disparity_color, cv::format("sgm execution time: %4.1f[msec] %4.1f[FPS]",
 			1e-3 * duration, fps), cv::Point(50, 50), 2, 0.75, cv::Scalar(255, 255, 255));
 		drawPoints3D(points, draw);
 		cv::imshow("left image", I1);
 		cv::imshow("disparity", disparity_color);
 		cv::imshow("points", draw);
 		const char c = cv::waitKey(1);
 		if (c == 27) // ESC
 			break;
 	}
 	return 0;
 }
@@ -0,0 +1,114 @@
 /*
 Copyright 2016 Fixstars Corporation
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
 #include <iostream>
 #include <chrono>
 #include <opencv2/core.hpp>
 #include <opencv2/imgproc.hpp>
 #include <opencv2/highgui.hpp>
 #include <sl/Camera.hpp>
 #include <libsgm.h>
 #include "sample_common.h"
 // Option table for cv::CommandLineParser. Each entry has the form
 // "{ name | default value | help text }".
 static const std::string keys =
 "{ disp_size           |    128 | maximum possible disparity value                  }"
 "{ camera_resolution   |      3 | camera resolution (0:HD2K 1:HD1080 2:HD720 3:VGA) }"
 "{ help h              |        | display this help and exit                        }";
 int main(int argc, char* argv[])
 {
 	cv::CommandLineParser parser(argc, argv, keys);
 	if (parser.has("help")) {
 		parser.printMessage();
 		return 0;
 	}
 	const int disp_size = parser.get<int>("disp_size");
 	const sl::RESOLUTION camera_resolution = parser.get<sl::RESOLUTION>("camera_resolution");
 	sl::Camera zed;
 	sl::InitParameters initParameters;
 	initParameters.camera_resolution = camera_resolution;
 	const sl::ERROR_CODE err = zed.open(initParameters);
 	if (err != sl::ERROR_CODE::SUCCESS) {
 		std::cerr << sl::toString(err) << std::endl;
 		std::exit(EXIT_FAILURE);
 	}
 	const auto& resolution = zed.getCameraInformation().camera_configuration.resolution;
 	sl::Mat d_zed_image_L(resolution, sl::MAT_TYPE::U8_C1, sl::MEM::GPU);
 	sl::Mat d_zed_image_R(resolution, sl::MAT_TYPE::U8_C1, sl::MEM::GPU);
 	CV_Assert(d_zed_image_L.getStep(sl::MEM::GPU) == d_zed_image_R.getStep(sl::MEM::GPU));
 	const int width = resolution.width;
 	const int height = resolution.height;
 	const int src_pitch = static_cast<int>(d_zed_image_L.getStep(sl::MEM::GPU));
 	const int dst_pitch = width;
 	const int src_depth = 8;
 	const int dst_depth = disp_size < 256 ? 8 : 16;
 	const int src_bytes = src_depth * width * height / 8;
 	const int dst_bytes = dst_depth * width * height / 8;
 	sgm::StereoSGM sgm(width, height, disp_size, src_depth, dst_depth, src_pitch, dst_pitch, sgm::EXECUTE_INOUT_CUDA2CUDA);
 	device_buffer d_disparity(dst_bytes);
 	cv::Mat disparity(height, width, dst_depth == 8 ? CV_8S : CV_16S), disparity_color;
 	const int invalid_disp = sgm.get_invalid_disparity();
 	std::cout << "max disparity    : " << disp_size << std::endl;
 	std::cout << "camera resolution: " << sl::toString(initParameters.camera_resolution) << " " << cv::Size(width, height) << std::endl;
 	while (1) {
 		if (zed.grab() == sl::ERROR_CODE::SUCCESS) {
 			zed.retrieveImage(d_zed_image_L, sl::VIEW::LEFT_GRAY, sl::MEM::GPU);
 			zed.retrieveImage(d_zed_image_R, sl::VIEW::RIGHT_GRAY, sl::MEM::GPU);
 		}
 		else {
 			continue;
 		}
 		const auto t1 = std::chrono::system_clock::now();
 		sgm.execute(d_zed_image_L.getPtr<uchar>(sl::MEM::GPU), d_zed_image_R.getPtr<uchar>(sl::MEM::GPU), d_disparity.data);
 		cudaDeviceSynchronize();
 		const auto t2 = std::chrono::system_clock::now();
 		const auto duration = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
 		const double fps = 1e6 / duration;
 		d_disparity.download(disparity.data);
 		// draw results
 		colorize_disparity(disparity, disparity_color, disp_size, disparity == invalid_disp);
 		cv::putText(disparity_color, cv::format("sgm execution time: %4.1f[msec] %4.1f[FPS]",
 			1e-3 * duration, fps), cv::Point(50, 50), 2, 0.75, cv::Scalar(255, 255, 255));
 		cv::imshow("disparity", disparity_color);
 		const char c = cv::waitKey(1);
 		if (c == 27) // ESC
 			break;
 	}
 	return 0;
 }
@@ -0,0 +1,55 @@
 # Build script for the libSGM library target `sgm` (C++ and CUDA sources of this directory).
 cmake_minimum_required(VERSION 3.18)
 # NOTE(review): PROJECT_SOURCE_DIR is read before this file's own project() call, so it
 # resolves to whichever project encloses this directory - presumably the libSGM root that
 # adds this directory as a subdirectory. Confirm against the top-level CMakeLists.txt.
 set(LIBSGM_ROOT_DIR ${PROJECT_SOURCE_DIR})
 set(LIBSGM_INCLUDE_DIR ${LIBSGM_ROOT_DIR}/include)
 # create project
 set(PROJECT_NAME sgm)
 project(${PROJECT_NAME} LANGUAGES CXX CUDA)
 # dependent packages
 find_package(CUDAToolkit REQUIRED)
 # OpenCV (core only) is looked up only when BUILD_OPENCV_WRAPPER is set by the caller
 if(BUILD_OPENCV_WRAPPER)
 	find_package(OpenCV REQUIRED core)
 endif()
 # library type: STATIC unless LIBSGM_SHARED is set
 set(SGM_LIB_TYPE STATIC)
 if(LIBSGM_SHARED)
 	set(SGM_LIB_TYPE SHARED)
 endif()
 # target configuration
 # Sources are globbed from this directory; public headers come from LIBSGM_INCLUDE_DIR.
 file(GLOB SRCS ./*.cpp ./*.cu ./*.h* ${LIBSGM_INCLUDE_DIR}/*.h*)
 add_library(${PROJECT_NAME} ${SGM_LIB_TYPE})
 target_sources(${PROJECT_NAME} PRIVATE ${SRCS})
 # The OpenCV include dirs / libraries are appended only when BUILD_OPENCV_WRAPPER is true.
 target_include_directories(${PROJECT_NAME} PRIVATE ${LIBSGM_INCLUDE_DIR} $<$<BOOL:${BUILD_OPENCV_WRAPPER}>:${OpenCV_INCLUDE_DIRS}>)
 target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17)
 target_link_libraries(${PROJECT_NAME} PUBLIC CUDA::cudart $<$<BOOL:${BUILD_OPENCV_WRAPPER}>:${OpenCV_LIBS}>)
 # Consumers linking against the target get the public include directory.
 set_target_properties(${PROJECT_NAME} PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${LIBSGM_INCLUDE_DIR})
 # Per-compiler options: warnings + -O3 for GCC/Clang C++, C4819 suppressed for MSVC,
 # and -lineinfo for CUDA sources.
 target_compile_options(${PROJECT_NAME} PRIVATE
 $<$<COMPILE_LANG_AND_ID:CXX,GNU>:-Wall -O3>
 $<$<COMPILE_LANG_AND_ID:CXX,Clang>:-Wall -O3>
 $<$<COMPILE_LANG_AND_ID:CXX,MSVC>:/wd4819>
 $<$<COMPILE_LANGUAGE:CUDA>:-lineinfo>
 )
 # Install the library binaries under <prefix>/lib (archives, shared libs) and <prefix>/bin (runtime).
 install(
 	TARGETS ${PROJECT_NAME}
 	ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib
 	LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib
 	RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin
 )
 # Install the public headers (*.h / *.hpp only) from the include directory.
 install(
 	DIRECTORY ${LIBSGM_INCLUDE_DIR}
 	DESTINATION ${CMAKE_INSTALL_PREFIX}
 	FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp"
 )
 # Install the find-module so downstream projects can locate the installed library.
 install(
 	FILES ${LIBSGM_ROOT_DIR}/FindLibSGM.cmake
 	DESTINATION ${CMAKE_INSTALL_PREFIX}
 )
@@ -0,0 +1,212 @@
 /*
 Copyright 2016 Fixstars Corporation
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
 #include "internal.h"
 #include <cuda_runtime.h>
 #include "types.h"
 #include "host_utility.h"
 namespace sgm
 {
 namespace
 {
 // Census window extent in pixels; the kernels below use WINDOW_WIDTH / 2 and
 // WINDOW_HEIGHT / 2 as the horizontal / vertical half sizes.
 static constexpr int WINDOW_WIDTH  = 9;
 static constexpr int WINDOW_HEIGHT = 7;
 // Threads per block used when launching the census kernels.
 static constexpr int BLOCK_SIZE = 128;
 // Number of consecutive image rows processed by one thread block.
 static constexpr int LINES_PER_BLOCK = 16;
 // Computes a 64-bit census feature per pixel.
 //
 // dest   : output features, indexed as dest[x + y * width]
 // src    : input pixels, indexed as src[x + y * pitch]
 // width, height : image size in pixels
 // pitch  : row stride of src, in elements of T
 //
 // Each block covers (BLOCK_SIZE - WINDOW_WIDTH + 1) output columns and LINES_PER_BLOCK rows.
 // Pixels outside the image are treated as 0. Output pixels closer than half a window to the
 // image border are not written by this kernel.
 template <typename T>
 __global__ void census_transform_kernel(uint64_t* dest, const T* src, int width, int height, int pitch)
 {
 	using pixel_type = T;
 	using feature_type = uint64_t;
 	// One extra row beyond the window height: the row being prefetched for the next
 	// iteration never overlaps the WINDOW_HEIGHT rows currently being read.
 	static const int SMEM_BUFFER_SIZE = WINDOW_HEIGHT + 1;
 	const int half_kw = WINDOW_WIDTH / 2;
 	const int half_kh = WINDOW_HEIGHT / 2;
 	// Ring buffer of image rows; row index is taken modulo SMEM_BUFFER_SIZE.
 	__shared__ pixel_type smem_lines[SMEM_BUFFER_SIZE][BLOCK_SIZE];
 	const int tid = threadIdx.x;
 	// Leftmost column loaded by this block (includes the left halo, so it can be negative).
 	const int x0 = blockIdx.x * (BLOCK_SIZE - WINDOW_WIDTH + 1) - half_kw;
 	const int y0 = blockIdx.y * LINES_PER_BLOCK;
 	// Preload the WINDOW_HEIGHT rows needed for the first output row (y0 - half_kh .. y0 + half_kh).
 	for (int i = 0; i < WINDOW_HEIGHT; ++i) {
 		const int x = x0 + tid, y = y0 - half_kh + i;
 		pixel_type value = 0;
 		if (0 <= x && x < width && 0 <= y && y < height) {
 			value = src[x + y * pitch];
 		}
 		smem_lines[i][tid] = value;
 	}
 	__syncthreads();
 #pragma unroll
 	for (int i = 0; i < LINES_PER_BLOCK; ++i) {
 		if (i + 1 < LINES_PER_BLOCK) {
 			// Load to smem: prefetch the bottom row required by the next output row
 			const int x = x0 + tid, y = y0 + half_kh + i + 1;
 			pixel_type value = 0;
 			if (0 <= x && x < width && 0 <= y && y < height) {
 				value = src[x + y * pitch];
 			}
 			const int smem_x = tid;
 			const int smem_y = (WINDOW_HEIGHT + i) % SMEM_BUFFER_SIZE;
 			smem_lines[smem_y][smem_x] = value;
 		}
 		// Only threads with a full horizontal halo inside the block's buffer produce output.
 		if (half_kw <= tid && tid < BLOCK_SIZE - half_kw) {
 			// Compute and store
 			const int x = x0 + tid, y = y0 + i;
 			if (half_kw <= x && x < width - half_kw && half_kh <= y && y < height - half_kh) {
 				const int smem_x = tid;
 				const int smem_y = (half_kh + i) % SMEM_BUFFER_SIZE;
 				// a: centre pixel; each feature bit is (a > neighbour)
 				const auto a = smem_lines[smem_y][smem_x];
 				feature_type f = 0;
 				for (int dy = -half_kh; dy <= half_kh; ++dy) {
 					for (int dx = -half_kw; dx <= half_kw; ++dx) {
 						// NOTE(review): this skips every neighbour on the centre row OR centre
 						// column (not just the centre pixel), so 8 * 6 = 48 bits are produced.
 						// Confirm whether `dx != 0 || dy != 0` was intended.
 						if (dx != 0 && dy != 0) {
 							const int smem_y1 = (smem_y + dy + SMEM_BUFFER_SIZE) % SMEM_BUFFER_SIZE;
 							const int smem_x1 = smem_x + dx;
 							const auto b = smem_lines[smem_y1][smem_x1];
 							f = (f << 1) | (a > b);
 						}
 					}
 				}
 				dest[x + y * width] = f;
 			}
 		}
 		// Make the prefetched row visible (and finish all reads) before the next iteration.
 		__syncthreads();
 	}
 }
 // Computes a 32-bit "symmetric" census feature per pixel: instead of comparing against the
 // centre pixel, each bit compares a window pixel with its point-mirrored counterpart.
 //
 // dest   : output features, indexed as dest[x + y * width]
 // src    : input pixels, indexed as src[x + y * pitch]
 // width, height : image size in pixels
 // pitch  : row stride of src, in elements of T
 //
 // Tiling, halo handling and the shared-memory row ring buffer match census_transform_kernel.
 // Pixels outside the image are treated as 0; output pixels closer than half a window to the
 // image border are not written by this kernel.
 template <typename T>
 __global__ void symmetric_census_kernel(uint32_t* dest, const T* src, int width, int height, int pitch)
 {
 	using pixel_type = T;
 	using feature_type = uint32_t;
 	// One extra row so the prefetched row never overlaps the rows being read.
 	static const int SMEM_BUFFER_SIZE = WINDOW_HEIGHT + 1;
 	const int half_kw = WINDOW_WIDTH  / 2;
 	const int half_kh = WINDOW_HEIGHT / 2;
 	// Ring buffer of image rows; row index is taken modulo SMEM_BUFFER_SIZE.
 	__shared__ pixel_type smem_lines[SMEM_BUFFER_SIZE][BLOCK_SIZE];
 	const int tid = threadIdx.x;
 	// Leftmost column loaded by this block (includes the left halo, so it can be negative).
 	const int x0 = blockIdx.x * (BLOCK_SIZE - WINDOW_WIDTH + 1) - half_kw;
 	const int y0 = blockIdx.y * LINES_PER_BLOCK;
 	// Preload the WINDOW_HEIGHT rows needed for the first output row.
 	for(int i = 0; i < WINDOW_HEIGHT; ++i){
 		const int x = x0 + tid, y = y0 - half_kh + i;
 		pixel_type value = 0;
 		if(0 <= x && x < width && 0 <= y && y < height){
 			value = src[x + y * pitch];
 		}
 		smem_lines[i][tid] = value;
 	}
 	__syncthreads();
 #pragma unroll
 	for(int i = 0; i < LINES_PER_BLOCK; ++i){
 		if(i + 1 < LINES_PER_BLOCK){
 			// Load to smem: prefetch the bottom row required by the next output row
 			const int x = x0 + tid, y = y0 + half_kh + i + 1;
 			pixel_type value = 0;
 			if(0 <= x && x < width && 0 <= y && y < height){
 				value = src[x + y * pitch];
 			}
 			const int smem_x = tid;
 			const int smem_y = (WINDOW_HEIGHT + i) % SMEM_BUFFER_SIZE;
 			smem_lines[smem_y][smem_x] = value;
 		}
 		// Only threads with a full horizontal halo inside the block's buffer produce output.
 		if(half_kw <= tid && tid < BLOCK_SIZE - half_kw){
 			// Compute and store
 			const int x = x0 + tid, y = y0 + i;
 			if(half_kw <= x && x < width - half_kw && half_kh <= y && y < height - half_kh){
 				const int smem_x = tid;
 				const int smem_y = (half_kh + i) % SMEM_BUFFER_SIZE;
 				feature_type f = 0;
 				// Rows above the centre vs. their mirrored rows below: 3 rows * 9 columns = 27 bits.
 				for(int dy = -half_kh; dy < 0; ++dy){
 					const int smem_y1 = (smem_y + dy + SMEM_BUFFER_SIZE) % SMEM_BUFFER_SIZE;
 					const int smem_y2 = (smem_y - dy + SMEM_BUFFER_SIZE) % SMEM_BUFFER_SIZE;
 					for(int dx = -half_kw; dx <= half_kw; ++dx){
 						const int smem_x1 = smem_x + dx;
 						const int smem_x2 = smem_x - dx;
 						const auto a = smem_lines[smem_y1][smem_x1];
 						const auto b = smem_lines[smem_y2][smem_x2];
 						f = (f << 1) | (a > b);
 					}
 				}
 				// Centre row: left half vs. mirrored right half, 4 more bits (31 bits total).
 				for(int dx = -half_kw; dx < 0; ++dx){
 					const int smem_x1 = smem_x + dx;
 					const int smem_x2 = smem_x - dx;
 					const auto a = smem_lines[smem_y][smem_x1];
 					const auto b = smem_lines[smem_y][smem_x2];
 					f = (f << 1) | (a > b);
 				}
 				dest[x + y * width] = f;
 			}
 		}
 		// Make the prefetched row visible (and finish all reads) before the next iteration.
 		__syncthreads();
 	}
 }
 } // namespace
 namespace details
 {
 void census_transform(const DeviceImage& src, DeviceImage& dst, CensusType type)
 {
 	const int w = src.cols;
 	const int h = src.rows;
 	const int w_per_block = BLOCK_SIZE - WINDOW_WIDTH + 1;
 	const int h_per_block = LINES_PER_BLOCK;
 	const dim3 gdim(divUp(w, w_per_block), divUp(h, h_per_block));
 	const dim3 bdim(BLOCK_SIZE);
 	dst.create(h, w, type == CensusType::CENSUS_9x7 ? SGM_64U : SGM_32U);
 	if (type == CensusType::CENSUS_9x7) {
 		if (src.type == SGM_8U)
 			census_transform_kernel<<<gdim, bdim>>>(dst.ptr<uint64_t>(), src.ptr<uint8_t>(), w, h, src.step);
 		else if (src.type == SGM_16U)
 			census_transform_kernel<<<gdim, bdim>>>(dst.ptr<uint64_t>(), src.ptr<uint16_t>(), w, h, src.step);
 		else
 			census_transform_kernel<<<gdim, bdim>>>(dst.ptr<uint64_t>(), src.ptr<uint32_t>(), w, h, src.step);
 	}
 	else if (type == CensusType::SYMMETRIC_CENSUS_9x7) {
 		if (src.type == SGM_8U)
 			symmetric_census_kernel<<<gdim, bdim>>>(dst.ptr<uint32_t>(), src.ptr<uint8_t>(), w, h, src.step);
 		else if (src.type == SGM_16U)
 			symmetric_census_kernel<<<gdim, bdim>>>(dst.ptr<uint32_t>(), src.ptr<uint16_t>(), w, h, src.step);
 		else
 			symmetric_census_kernel<<<gdim, bdim>>>(dst.ptr<uint32_t>(), src.ptr<uint32_t>(), w, h, src.step);
 	}
 	CUDA_CHECK(cudaGetLastError());
 }
 } // namespace details
 } // namespace sgm
@@ -0,0 +1,87 @@
 /*
 Copyright 2016 Fixstars Corporation
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
 #include "internal.h"
 #include <cuda_runtime.h>
 #include "constants.h"
 #include "host_utility.h"
 namespace
 {
 // Invalidates left-disparity pixels that are masked out or fail the left-right check.
 //
 // dispL       : left disparity map, updated in place (row stride dst_pitch)
 // dispR       : right disparity map (row stride dst_pitch)
 // srcL        : left source image; a pixel value of 0 marks the pixel as masked (row stride src_pitch)
 // subpixel    : when true, dispL values are shifted right by SUBPIXEL_SHIFT before the comparison
 // LR_max_diff : maximum tolerated |dispR - dispL|; a negative value disables the comparison
 template<typename SRC_T, typename DST_T>
 __global__ void check_consistency_kernel(DST_T* dispL, const DST_T* dispR, const SRC_T* srcL, int width, int height, int src_pitch, int dst_pitch, bool subpixel, int LR_max_diff)
 {
 	const int x = blockIdx.x * blockDim.x + threadIdx.x;
 	const int y = blockIdx.y * blockDim.y + threadIdx.y;
 	if (x >= width || y >= height)
 		return;

 	const int left_index = y * dst_pitch + x;
 	const SRC_T mask = srcL[y * src_pitch + x];
 	const DST_T org = dispL[left_index];

 	// Integer disparity of the left pixel (fractional bits dropped in subpixel mode).
 	const int d = subpixel
 		? (static_cast<int>(org) >> sgm::StereoSGM::SUBPIXEL_SHIFT)
 		: static_cast<int>(org);

 	// Column in the right image that the left pixel maps to.
 	const int k = x - d;

 	bool invalid = (mask == 0) || (org == sgm::INVALID_DISP);
 	if (!invalid && k >= 0 && k < width && LR_max_diff >= 0) {
 		// left-right consistency check, only on leftDisp, but could be done for rightDisp too
 		invalid = abs(dispR[y * dst_pitch + k] - d) > LR_max_diff;
 	}

 	if (invalid) {
 		// masked or left-right inconsistent pixel -> invalid
 		dispL[left_index] = static_cast<DST_T>(sgm::INVALID_DISP);
 	}
 }
 } // namespace
 namespace sgm
 {
 namespace details
 {
 // Host-side launcher for the left-right consistency check.
 //
 // dispL / dispR must both be SGM_16U. The kernel is instantiated for the pixel type of
 // srcL: SGM_8U and SGM_16U are dispatched explicitly, anything else is read as 32-bit.
 void check_consistency(DeviceImage& dispL, const DeviceImage& dispR, const DeviceImage& srcL, bool subpixel, int LR_max_diff)
 {
 	SGM_ASSERT(dispL.type == SGM_16U && dispR.type == SGM_16U, "");

 	const int cols = srcL.cols;
 	const int rows = srcL.rows;

 	// 16x16 threads per block, one thread per pixel.
 	const dim3 threads(16, 16);
 	const dim3 blocks(divUp(cols, threads.x), divUp(rows, threads.y));

 	if (srcL.type == SGM_8U) {
 		check_consistency_kernel<uint8_t><<<blocks, threads>>>(
 			dispL.ptr<uint16_t>(), dispR.ptr<uint16_t>(), srcL.ptr<uint8_t>(),
 			cols, rows, srcL.step, dispL.step, subpixel, LR_max_diff);
 	}
 	else if (srcL.type == SGM_16U) {
 		check_consistency_kernel<uint16_t><<<blocks, threads>>>(
 			dispL.ptr<uint16_t>(), dispR.ptr<uint16_t>(), srcL.ptr<uint16_t>(),
 			cols, rows, srcL.step, dispL.step, subpixel, LR_max_diff);
 	}
 	else {
 		check_consistency_kernel<uint32_t><<<blocks, threads>>>(
 			dispL.ptr<uint16_t>(), dispR.ptr<uint16_t>(), srcL.ptr<uint32_t>(),
 			cols, rows, srcL.step, dispL.step, subpixel, LR_max_diff);
 	}

 	// Surface launch-configuration errors immediately.
 	CUDA_CHECK(cudaGetLastError());
 }
 } // namespace details
 } // namespace sgm
@@ -0,0 +1,29 @@
 /*Copyright 2016 Fixstars Corporation
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
 // Compile-time constants shared by the SGM implementation files.
 #ifndef __CONSTANTS_H__
 #define __CONSTANTS_H__
 #include "types.h"
 namespace sgm
 {
 // Number of threads in a warp, as assumed by the kernels in this library.
 static constexpr unsigned int WARP_SIZE = 32u;
 // Sentinel stored in disparity maps for pixels without a valid disparity:
 // -1 converted to output_type (declared in types.h).
 static constexpr output_type INVALID_DISP = static_cast<output_type>(-1);
 } // namespace sgm
 #endif // !__CONSTANTS_H__
@@ -0,0 +1,73 @@
 /*
 Copyright 2016 Fixstars Corporation
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
 #include "internal.h"
 #include <cuda_runtime.h>
 #include "constants.h"
 #include "host_utility.h"
 namespace
 {
 // Shifts every valid disparity by `min_disp_scaled` and rewrites the internal invalid
 // marker (sgm::INVALID_DISP) to `invalid_disp_scaled`, in place.
 //
 // d_disp : disparity map, indexed as d_disp[y * pitch + x]
 // pitch  : row stride of d_disp in elements
 __global__ void correct_disparity_range_kernel(uint16_t* d_disp, int width, int height, int pitch, int min_disp_scaled, int invalid_disp_scaled)
 {
 	const int x = blockIdx.x * blockDim.x + threadIdx.x;
 	const int y = blockIdx.y * blockDim.y + threadIdx.y;
 	const bool inside = (x < width) && (y < height);
 	if (!inside) {
 		return;
 	}

 	uint16_t* const pixel = d_disp + (y * pitch + x);
 	const uint16_t current = *pixel;

 	// Both results are narrowed back to 16 bits, exactly like an in-place update would be.
 	*pixel = (current == sgm::INVALID_DISP)
 		? static_cast<uint16_t>(invalid_disp_scaled)
 		: static_cast<uint16_t>(current + min_disp_scaled);
 }
 } // namespace
 namespace sgm
 {
 namespace details
 {
 // Host-side launcher that offsets a 16-bit disparity map by `min_disp` and remaps the
 // invalid marker to (min_disp - 1). In subpixel mode both values are multiplied by
 // StereoSGM::SUBPIXEL_SCALE. Does nothing when neither subpixel mode nor an offset is in use.
 void correct_disparity_range(DeviceImage& disp, bool subpixel, int min_disp)
 {
 	const bool needs_correction = subpixel || min_disp != 0;
 	if (!needs_correction) {
 		return;
 	}

 	const int cols = disp.cols;
 	const int rows = disp.rows;

 	// One thread per pixel in square tiles.
 	constexpr int TILE = 16;
 	const dim3 threads(TILE, TILE);
 	const dim3 blocks(divUp(cols, TILE), divUp(rows, TILE));

 	const int scale = subpixel ? StereoSGM::SUBPIXEL_SCALE : 1;
 	const int shifted_min = min_disp * scale;
 	const int shifted_invalid = (min_disp - 1) * scale;

 	correct_disparity_range_kernel<<<blocks, threads>>>(
 		disp.ptr<uint16_t>(), cols, rows, disp.step, shifted_min, shifted_invalid);

 	// Surface launch-configuration errors immediately.
 	CUDA_CHECK(cudaGetLastError());
 }
 } // namespace details
 } // namespace sgm
@@ -0,0 +1,668 @@
 /*
 Copyright 2016 Fixstars Corporation
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
 #include "internal.h"
 #include <cuda_runtime.h>
 #include "device_utility.h"
 #include "host_utility.h"
 // Warp-shuffle wrappers. When CUDA_VERSION >= 9000 they expand to the *_sync intrinsics and
 // forward the participation mask; otherwise the mask argument is dropped and the legacy
 // __shfl_up / __shfl_down intrinsics are used.
 #if CUDA_VERSION >= 9000
 #define SHFL_UP(mask, var, delta, w) __shfl_up_sync((mask), (var), (delta), (w))
 #define SHFL_DOWN(mask, var, delta, w) __shfl_down_sync((mask), (var), (delta), (w))
 #else
 #define SHFL_UP(mask, var, delta, width) __shfl_up((var), (delta), (width))
 #define SHFL_DOWN(mask, var, delta, width) __shfl_down((var), (delta), (width))
 #endif
 namespace sgm
 {
 // Element type of the aggregated cost volume (alias of cost_type, declared elsewhere).
 using COST_TYPE = cost_type;
 namespace cost_aggregation
 {
 // Population count (number of set bits) for census feature words.
 // The generic version is a fallback that returns 0; only the 32-bit and 64-bit
 // specializations perform an actual count.
 template <typename T>
 __device__ inline int popcnt(T x)
 {
 	return 0;
 }
 // 32-bit features.
 template <>
 __device__ inline int popcnt(uint32_t x)
 {
 	return __popc(x);
 }
 // 64-bit features.
 template <>
 __device__ inline int popcnt(uint64_t x)
 {
 	return __popcll(x);
 }
 // Per-thread state of the path-cost recurrence along one aggregation path.
 //
 // A path's disparity range is split across SUBGROUP_SIZE consecutive lanes; each lane keeps
 // DP_BLOCK_SIZE consecutive disparities in dp[]. Neighbouring lanes exchange their boundary
 // elements through warp shuffles.
 template <unsigned int DP_BLOCK_SIZE, unsigned int SUBGROUP_SIZE>
 struct DynamicProgramming
 {
 	static_assert(DP_BLOCK_SIZE >= 2, "DP_BLOCK_SIZE must be greater than or equal to 2");
 	static_assert((SUBGROUP_SIZE & (SUBGROUP_SIZE - 1)) == 0, "SUBGROUP_SIZE must be a power of 2");
 	// Subgroup-wide minimum of the costs produced by the previous update() call.
 	uint32_t last_min;
 	// Costs produced by the previous update() call for this lane's disparities.
 	uint32_t dp[DP_BLOCK_SIZE];
 	// Starts with all costs and the running minimum at zero.
 	__device__ DynamicProgramming() : last_min(0)
 	{
 		for (unsigned int i = 0; i < DP_BLOCK_SIZE; ++i) { dp[i] = 0; }
 	}
 	// Advances the recurrence by one pixel along the path.
 	//
 	// local_costs : DP_BLOCK_SIZE matching costs for this lane's disparities at the new pixel
 	// p1          : penalty added when taking the cost of an adjacent disparity
 	// p2          : upper bound applied to (previous cost - last_min) for the same disparity
 	// mask        : participation mask forwarded to the shuffle / subgroup_min helpers
 	//
 	// For every disparity k the new cost is
 	//   local_costs[k] + min(dp[k] - last_min, p2, dp[k-1] - last_min + p1, dp[k+1] - last_min + p1)
 	// where dp[k-1] / dp[k+1] are fetched from the neighbouring lane at block boundaries.
 	__device__ void update(uint32_t *local_costs, uint32_t p1, uint32_t p2, uint32_t mask)
 	{
 		const unsigned int lane_id = threadIdx.x % SUBGROUP_SIZE;
 		// dp[0] is overwritten before the last element is processed, so keep a copy to
 		// hand to the previous lane through SHFL_DOWN below.
 		const auto dp0 = dp[0];
 		// lazy_out holds the freshly computed cost for k-1; it is written back one step
 		// late so that the old dp[k-1] is still available while computing element k.
 		uint32_t lazy_out = 0, local_min = 0;
 		{
 			// First element: the lower neighbour lives in the previous lane.
 			const unsigned int k = 0;
 			const uint32_t prev = SHFL_UP(mask, dp[DP_BLOCK_SIZE - 1], 1, WARP_SIZE);
 			uint32_t out = min(dp[k] - last_min, p2);
 			// The first lane of a subgroup has no lower neighbour within its own path.
 			if (lane_id != 0) { out = min(out, prev - last_min + p1); }
 			out = min(out, dp[k + 1] - last_min + p1);
 			lazy_out = local_min = out + local_costs[k];
 		}
 		// Interior elements: both neighbours are local.
 		for (unsigned int k = 1; k + 1 < DP_BLOCK_SIZE; ++k) {
 			uint32_t out = min(dp[k] - last_min, p2);
 			out = min(out, dp[k - 1] - last_min + p1);
 			out = min(out, dp[k + 1] - last_min + p1);
 			dp[k - 1] = lazy_out;
 			lazy_out = out + local_costs[k];
 			local_min = min(local_min, lazy_out);
 		}
 		{
 			// Last element: the upper neighbour lives in the next lane.
 			const unsigned int k = DP_BLOCK_SIZE - 1;
 			const uint32_t next = SHFL_DOWN(mask, dp0, 1, WARP_SIZE);
 			uint32_t out = min(dp[k] - last_min, p2);
 			out = min(out, dp[k - 1] - last_min + p1);
 			// The last lane of a subgroup has no upper neighbour within its own path.
 			if (lane_id + 1 != SUBGROUP_SIZE) {
 				out = min(out, next - last_min + p1);
 			}
 			dp[k - 1] = lazy_out;
 			dp[k] = out + local_costs[k];
 			local_min = min(local_min, dp[k]);
 		}
 		// Reduce the per-lane minima across the subgroup for use in the next step.
 		last_min = subgroup_min<SUBGROUP_SIZE>(local_min, mask);
 	}
 };
 // Returns a bit mask with the lowest SIZE bits set (SIZE may be 0..32).
 // The shift is done in 64 bits so that SIZE == 32 is well defined.
 template <unsigned int SIZE>
 __device__ unsigned int generate_mask()
 {
 	static_assert(SIZE <= 32, "SIZE must be less than or equal to 32");
 	constexpr unsigned long long low_bits = (1ull << SIZE) - 1u;
 	return static_cast<unsigned int>(low_bits);
 }
 // Bounds-checked read of one census feature from a row.
 // Returns ptr[x] (through the read-only load path) when 0 <= x < w, and 0 otherwise.
 template <typename CENSUS_T>
 __device__ inline CENSUS_T load_census_with_check(const CENSUS_T* ptr, int x, int w)
 {
 	if (x < 0 || x >= w) {
 		return 0;
 	}
 	return __ldg(ptr + x);
 }
 namespace vertical
 {
 // Number of disparities handled by each thread of the vertical aggregation kernel.
 static constexpr unsigned int DP_BLOCK_SIZE = 16u;
 // Threads per block for the vertical aggregation kernel (8 warps).
 static constexpr unsigned int BLOCK_SIZE = WARP_SIZE * 8u;
 // Aggregates matching costs along vertical paths (top-to-bottom for DIRECTION == 1,
 // bottom-to-top for DIRECTION == -1).
 //
 // dest        : cost volume, indexed as dest[d + x * MAX_DISPARITY + y * MAX_DISPARITY * width]
 // left, right : census feature images with row stride `width`
 // p1, p2      : penalties forwarded to DynamicProgramming::update
 // min_disp    : disparity offset; the cost for disparity index d compares left(x) with
 //               right(x - d - min_disp)
 //
 // SUBGROUP_SIZE consecutive threads share one column x, each covering DP_BLOCK_SIZE
 // consecutive disparities starting at dp_offset.
 template <typename CENSUS_TYPE, int DIRECTION, unsigned int MAX_DISPARITY>
 __global__ void aggregate_vertical_path_kernel(
 	uint8_t *dest,
 	const CENSUS_TYPE *left,
 	const CENSUS_TYPE *right,
 	int width,
 	int height,
 	unsigned int p1,
 	unsigned int p2,
 	int min_disp)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
 	static const unsigned int PATHS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE;
 	static const unsigned int PATHS_PER_BLOCK = BLOCK_SIZE / SUBGROUP_SIZE;
 	// Right-image features needed by one block for one row: the disparity range plus
 	// one entry per column handled by the block.
 	static const unsigned int RIGHT_BUFFER_SIZE = MAX_DISPARITY + PATHS_PER_BLOCK;
 	static const unsigned int RIGHT_BUFFER_ROWS = RIGHT_BUFFER_SIZE / DP_BLOCK_SIZE;
 	static_assert(DIRECTION == 1 || DIRECTION == -1, "");
 	if (width == 0 || height == 0) {
 		return;
 	}
 	// Entry i of the row buffer is stored at [i % DP_BLOCK_SIZE][i / DP_BLOCK_SIZE] and
 	// duplicated at [i % DP_BLOCK_SIZE + DP_BLOCK_SIZE][i / DP_BLOCK_SIZE - 1], so that any
 	// DP_BLOCK_SIZE consecutive entries can be read with a single second index.
 	__shared__ CENSUS_TYPE right_buffer[2 * DP_BLOCK_SIZE][RIGHT_BUFFER_ROWS + 1];
 	DynamicProgramming<DP_BLOCK_SIZE, SUBGROUP_SIZE> dp;
 	const unsigned int warp_id = threadIdx.x / WARP_SIZE;
 	const unsigned int group_id = threadIdx.x % WARP_SIZE / SUBGROUP_SIZE;
 	const unsigned int lane_id = threadIdx.x % SUBGROUP_SIZE;
 	// Shuffle mask restricted to the lanes of this thread's subgroup.
 	const unsigned int shfl_mask =
 		generate_mask<SUBGROUP_SIZE>() << (group_id * SUBGROUP_SIZE);
 	// Column processed by this thread's subgroup.
 	const unsigned int x =
 		blockIdx.x * PATHS_PER_BLOCK +
 		warp_id * PATHS_PER_WARP +
 		group_id;
 	// First column handled by this block.
 	const unsigned int right_x0 = blockIdx.x * PATHS_PER_BLOCK;
 	// First disparity index handled by this thread.
 	const unsigned int dp_offset = lane_id * DP_BLOCK_SIZE;
 	// Buffer index of the right-image feature matching this thread's first disparity.
 	const unsigned int right0_addr =
 		(right_x0 + PATHS_PER_BLOCK - 1) - x + dp_offset;
 	const unsigned int right0_addr_lo = right0_addr % DP_BLOCK_SIZE;
 	const unsigned int right0_addr_hi = right0_addr / DP_BLOCK_SIZE;
 	for (unsigned int iter = 0; iter < height; ++iter) {
 		const unsigned int y = (DIRECTION > 0 ? iter : height - 1 - iter);
 		// Load left to register (left_value is only consumed below under the same x < width guard)
 		CENSUS_TYPE left_value;
 		if (x < width) {
 			left_value = left[x + y * width];
 		}
 		// Load right to smem: buffer entry i holds right(right_x0 + PATHS_PER_BLOCK - 1 - i - min_disp),
 		// or 0 when that column lies outside the image
 		for (unsigned int i0 = 0; i0 < RIGHT_BUFFER_SIZE; i0 += BLOCK_SIZE) {
 			const unsigned int i = i0 + threadIdx.x;
 			if (i < RIGHT_BUFFER_SIZE) {
 				const int right_x = static_cast<int>(right_x0 + PATHS_PER_BLOCK - 1 - i - min_disp);
 				const CENSUS_TYPE right_value = load_census_with_check(&right[y * width], right_x, width);
 				const unsigned int lo = i % DP_BLOCK_SIZE;
 				const unsigned int hi = i / DP_BLOCK_SIZE;
 				right_buffer[lo][hi] = right_value;
 				if (hi > 0) {
 					right_buffer[lo + DP_BLOCK_SIZE][hi - 1] = right_value;
 				}
 			}
 		}
 		// All threads of the block must finish filling the buffer before anyone reads it.
 		__syncthreads();
 		// Compute
 		if (x < width) {
 			CENSUS_TYPE right_values[DP_BLOCK_SIZE];
 			for (unsigned int j = 0; j < DP_BLOCK_SIZE; ++j) {
 				right_values[j] = right_buffer[right0_addr_lo + j][right0_addr_hi];
 			}
 			// Matching cost: Hamming distance between left and right census features.
 			uint32_t local_costs[DP_BLOCK_SIZE];
 			for (unsigned int j = 0; j < DP_BLOCK_SIZE; ++j) {
 				local_costs[j] = popcnt(left_value ^ right_values[j]);
 			}
 			dp.update(local_costs, p1, p2, shfl_mask);
 			// Write this thread's DP_BLOCK_SIZE aggregated costs for pixel (x, y).
 			store_uint8_vector<DP_BLOCK_SIZE>(
 				&dest[dp_offset + x * MAX_DISPARITY + y * MAX_DISPARITY * width],
 				dp.dp);
 		}
 		// Keep the buffer intact until every thread has read it; the next row overwrites it.
 		__syncthreads();
 	}
 }
 // Launches the vertical aggregation kernel in the top-to-bottom direction on `stream`.
 // The grid has one block per PATHS_PER_BLOCK image columns (rounded up).
 template <typename CENSUS_TYPE, unsigned int MAX_DISPARITY>
 void aggregate_up2down(
 	COST_TYPE *dest,
 	const CENSUS_TYPE *left,
 	const CENSUS_TYPE *right,
 	int width,
 	int height,
 	unsigned int p1,
 	unsigned int p2,
 	int min_disp,
 	cudaStream_t stream)
 {
 	// Threads cooperating on a single column, and columns covered by one block.
 	constexpr unsigned int threads_per_path = MAX_DISPARITY / DP_BLOCK_SIZE;
 	constexpr unsigned int paths_per_block = BLOCK_SIZE / threads_per_path;

 	const int num_blocks = (width + paths_per_block - 1) / paths_per_block;
 	const int num_threads = BLOCK_SIZE;

 	aggregate_vertical_path_kernel<CENSUS_TYPE, 1, MAX_DISPARITY><<<num_blocks, num_threads, 0, stream>>>(
 		dest, left, right,
 		width, height,
 		p1, p2, min_disp);

 	// Surface launch-configuration errors immediately.
 	CUDA_CHECK(cudaGetLastError());
 }
 // Launches the vertical aggregation kernel in the bottom-to-top direction on `stream`.
 // The grid has one block per PATHS_PER_BLOCK image columns (rounded up).
 template <typename CENSUS_TYPE, unsigned int MAX_DISPARITY>
 void aggregate_down2up(
 	COST_TYPE *dest,
 	const CENSUS_TYPE *left,
 	const CENSUS_TYPE *right,
 	int width,
 	int height,
 	unsigned int p1,
 	unsigned int p2,
 	int min_disp,
 	cudaStream_t stream)
 {
 	// Threads cooperating on a single column, and columns covered by one block.
 	constexpr unsigned int threads_per_path = MAX_DISPARITY / DP_BLOCK_SIZE;
 	constexpr unsigned int paths_per_block = BLOCK_SIZE / threads_per_path;

 	const int num_blocks = (width + paths_per_block - 1) / paths_per_block;
 	const int num_threads = BLOCK_SIZE;

 	aggregate_vertical_path_kernel<CENSUS_TYPE, -1, MAX_DISPARITY><<<num_blocks, num_threads, 0, stream>>>(
 		dest, left, right,
 		width, height,
 		p1, p2, min_disp);

 	// Surface launch-configuration errors immediately.
 	CUDA_CHECK(cudaGetLastError());
 }
 } // namespace vertical
 namespace horizontal
 {
 // Number of disparities handled by each thread of the horizontal aggregation kernel.
 static constexpr unsigned int DP_BLOCK_SIZE = 8u;
 // Number of image rows (paths) each thread works on.
 static constexpr unsigned int DP_BLOCKS_PER_THREAD = 1u;
 // Warps, and resulting thread count, per block for the horizontal aggregation kernel.
 static constexpr unsigned int WARPS_PER_BLOCK = 4u;
 static constexpr unsigned int BLOCK_SIZE = WARP_SIZE * WARPS_PER_BLOCK;
 // Aggregates matching costs along horizontal paths (left-to-right for DIRECTION == 1,
 // right-to-left for DIRECTION == -1).
 //
 // dest        : cost volume, indexed as dest[d + x * MAX_DISPARITY + y * MAX_DISPARITY * width]
 // left, right : census feature images with row stride `width`
 // p1, p2      : penalties forwarded to DynamicProgramming::update
 // min_disp    : disparity offset applied when addressing the right image
 //
 // SUBGROUP_SIZE consecutive threads share one image row; each thread keeps a sliding window
 // of DP_BLOCK_SIZE right-image features that is shifted by one element per processed pixel.
 template <typename CENSUS_TYPE, int DIRECTION, unsigned int MAX_DISPARITY>
 __global__ void aggregate_horizontal_path_kernel(
 	uint8_t *dest,
 	const CENSUS_TYPE *left,
 	const CENSUS_TYPE *right,
 	int width,
 	int height,
 	unsigned int p1,
 	unsigned int p2,
 	int min_disp)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
 	static const unsigned int SUBGROUPS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE;
 	static const unsigned int PATHS_PER_WARP =
 		WARP_SIZE * DP_BLOCKS_PER_THREAD / SUBGROUP_SIZE;
 	static const unsigned int PATHS_PER_BLOCK =
 		BLOCK_SIZE * DP_BLOCKS_PER_THREAD / SUBGROUP_SIZE;
 	static_assert(DIRECTION == 1 || DIRECTION == -1, "");
 	if (width == 0 || height == 0) {
 		return;
 	}
 	// Per-thread sliding window of right-image features, one window per handled row.
 	CENSUS_TYPE right_buffer[DP_BLOCKS_PER_THREAD][DP_BLOCK_SIZE];
 	DynamicProgramming<DP_BLOCK_SIZE, SUBGROUP_SIZE> dp[DP_BLOCKS_PER_THREAD];
 	const unsigned int warp_id = threadIdx.x / WARP_SIZE;
 	const unsigned int group_id = threadIdx.x % WARP_SIZE / SUBGROUP_SIZE;
 	const unsigned int lane_id = threadIdx.x % SUBGROUP_SIZE;
 	// Shuffle mask restricted to the lanes of this thread's subgroup.
 	const unsigned int shfl_mask =
 		generate_mask<SUBGROUP_SIZE>() << (group_id * SUBGROUP_SIZE);
 	// First row handled by this thread's subgroup.
 	const unsigned int y0 =
 		PATHS_PER_BLOCK * blockIdx.x +
 		PATHS_PER_WARP * warp_id +
 		group_id;
 	// Element distance between the rows a thread handles when DP_BLOCKS_PER_THREAD > 1.
 	const unsigned int feature_step = SUBGROUPS_PER_WARP * width;
 	const unsigned int dest_step = SUBGROUPS_PER_WARP * MAX_DISPARITY * width;
 	// First disparity index handled by this thread.
 	const unsigned int dp_offset = lane_id * DP_BLOCK_SIZE;
 	// Rebase all pointers to row y0; they are only dereferenced after the y0 check below.
 	left += y0 * width;
 	right += y0 * width;
 	dest += y0 * MAX_DISPARITY * width;
 	if (y0 >= height) {
 		return;
 	}
 	// initialize census buffer: prime the window for the position one step before the first
 	// column in scan order (x = -1 for left-to-right, x = width for right-to-left)
 	{
 		const int x0 = (DIRECTION > 0 ? -1 : width) - (min_disp + static_cast<int>(dp_offset));
 		for (int dy = 0; dy < DP_BLOCKS_PER_THREAD; ++dy)
 			for (int dx = 0; dx < DP_BLOCK_SIZE; ++dx)
 				right_buffer[dy][dx] = load_census_with_check(&right[dy * feature_step], x0 - dx, width);
 	}
 	// Columns are visited in chunks of DP_BLOCK_SIZE; for right-to-left the first chunk is the
 	// last DP_BLOCK_SIZE-aligned chunk of the row.
 	int x0 = (DIRECTION > 0) ? 0 : static_cast<int>((width - 1) & ~(DP_BLOCK_SIZE - 1));
 	for (unsigned int iter = 0; iter < width; iter += DP_BLOCK_SIZE) {
 		for (unsigned int i = 0; i < DP_BLOCK_SIZE; ++i) {
 			const unsigned int x = x0 + (DIRECTION > 0 ? i : (DP_BLOCK_SIZE - 1 - i));
 			// Skip the padding columns of a partial chunk.
 			if (x >= width) {
 				continue;
 			}
 			for (unsigned int j = 0; j < DP_BLOCKS_PER_THREAD; ++j) {
 				const unsigned int y = y0 + j * SUBGROUPS_PER_WARP;
 				if (y >= height) {
 					continue;
 				}
 				const CENSUS_TYPE left_value = __ldg(&left[j * feature_step + x]);
 				if (DIRECTION > 0) {
 					// Shift the window up by one; the element leaving this thread's window
 					// is handed to the next lane, and lane 0 loads the new value from memory.
 					const CENSUS_TYPE t = right_buffer[j][DP_BLOCK_SIZE - 1];
 					for (unsigned int k = DP_BLOCK_SIZE - 1; k > 0; --k) {
 						right_buffer[j][k] = right_buffer[j][k - 1];
 					}
 					right_buffer[j][0] = SHFL_UP(shfl_mask, t, 1, SUBGROUP_SIZE);
 					if (lane_id == 0) {
 						right_buffer[j][0] = load_census_with_check(&right[j * feature_step], x - min_disp, width);
 					}
 				}
 				else {
 					// Shift the window down by one; the element leaving this thread's window
 					// is handed to the previous lane, and the last lane loads the new value.
 					const CENSUS_TYPE t = right_buffer[j][0];
 					for (unsigned int k = 1; k < DP_BLOCK_SIZE; ++k) {
 						right_buffer[j][k - 1] = right_buffer[j][k];
 					}
 					right_buffer[j][DP_BLOCK_SIZE - 1] = SHFL_DOWN(shfl_mask, t, 1, SUBGROUP_SIZE);
 					if (lane_id + 1 == SUBGROUP_SIZE) {
 						right_buffer[j][DP_BLOCK_SIZE - 1] = load_census_with_check(&right[j * feature_step], x - (min_disp + dp_offset + DP_BLOCK_SIZE - 1), width);
 					}
 				}
 				// Matching cost: Hamming distance between left and right census features.
 				uint32_t local_costs[DP_BLOCK_SIZE];
 				for (unsigned int k = 0; k < DP_BLOCK_SIZE; ++k) {
 					local_costs[k] = popcnt(left_value ^ right_buffer[j][k]);
 				}
 				dp[j].update(local_costs, p1, p2, shfl_mask);
 				// Write this thread's DP_BLOCK_SIZE aggregated costs for pixel (x, y).
 				store_uint8_vector<DP_BLOCK_SIZE>(
 					&dest[j * dest_step + x * MAX_DISPARITY + dp_offset],
 					dp[j].dp);
 			}
 		}
 		x0 += static_cast<int>(DP_BLOCK_SIZE) * DIRECTION;
 	}
 }
 // Launches the horizontal aggregation kernel in the left-to-right direction on `stream`.
 // The grid has one block per PATHS_PER_BLOCK image rows (rounded up).
 template <typename CENSUS_TYPE, unsigned int MAX_DISPARITY>
 void aggregate_left2right(
 	COST_TYPE *dest,
 	const CENSUS_TYPE *left,
 	const CENSUS_TYPE *right,
 	int width,
 	int height,
 	unsigned int p1,
 	unsigned int p2,
 	int min_disp,
 	cudaStream_t stream)
 {
 	// Threads cooperating on a single row, and rows covered by one block.
 	constexpr unsigned int threads_per_path = MAX_DISPARITY / DP_BLOCK_SIZE;
 	constexpr unsigned int paths_per_block =
 		BLOCK_SIZE * DP_BLOCKS_PER_THREAD / threads_per_path;

 	const int num_blocks = (height + paths_per_block - 1) / paths_per_block;
 	const int num_threads = BLOCK_SIZE;

 	aggregate_horizontal_path_kernel<CENSUS_TYPE, 1, MAX_DISPARITY><<<num_blocks, num_threads, 0, stream>>>(
 		dest, left, right,
 		width, height,
 		p1, p2, min_disp);

 	// Surface launch-configuration errors immediately.
 	CUDA_CHECK(cudaGetLastError());
 }
 // Launches the horizontal aggregation kernel in the right-to-left direction on `stream`.
 // The grid has one block per PATHS_PER_BLOCK image rows (rounded up).
 template <typename CENSUS_TYPE, unsigned int MAX_DISPARITY>
 void aggregate_right2left(
 	COST_TYPE *dest,
 	const CENSUS_TYPE *left,
 	const CENSUS_TYPE *right,
 	int width,
 	int height,
 	unsigned int p1,
 	unsigned int p2,
 	int min_disp,
 	cudaStream_t stream)
 {
 	// Threads cooperating on a single row, and rows covered by one block.
 	constexpr unsigned int threads_per_path = MAX_DISPARITY / DP_BLOCK_SIZE;
 	constexpr unsigned int paths_per_block =
 		BLOCK_SIZE * DP_BLOCKS_PER_THREAD / threads_per_path;

 	const int num_blocks = (height + paths_per_block - 1) / paths_per_block;
 	const int num_threads = BLOCK_SIZE;

 	aggregate_horizontal_path_kernel<CENSUS_TYPE, -1, MAX_DISPARITY><<<num_blocks, num_threads, 0, stream>>>(
 		dest, left, right,
 		width, height,
 		p1, p2, min_disp);

 	// Surface launch-configuration errors immediately.
 	CUDA_CHECK(cudaGetLastError());
 }
 } // namespace horizontal
 namespace oblique
 {
 // Number of disparities handled by each thread of the oblique aggregation kernel.
 static constexpr unsigned int DP_BLOCK_SIZE = 16u;
 // Threads per block for the oblique aggregation kernel (8 warps).
 static constexpr unsigned int BLOCK_SIZE = WARP_SIZE * 8u;
 // Aggregates matching costs along diagonal paths. Each step moves one row in Y_DIRECTION
 // and one column in X_DIRECTION.
 //
 // dest        : cost volume, indexed as dest[d + x * MAX_DISPARITY + y * MAX_DISPARITY * width]
 // left, right : census feature images with row stride `width`
 // p1, p2      : penalties forwarded to DynamicProgramming::update
 // min_disp    : disparity offset applied when addressing the right image
 //
 // SUBGROUP_SIZE consecutive threads share one path, each covering DP_BLOCK_SIZE consecutive
 // disparities. A path may start or end outside the image; such positions are skipped.
 template <typename CENSUS_TYPE, int X_DIRECTION, int Y_DIRECTION, unsigned int MAX_DISPARITY>
 __global__ void aggregate_oblique_path_kernel(
 	uint8_t *dest,
 	const CENSUS_TYPE *left,
 	const CENSUS_TYPE *right,
 	int width,
 	int height,
 	unsigned int p1,
 	unsigned int p2,
 	int min_disp)
 {
 	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
 	static const unsigned int PATHS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE;
 	static const unsigned int PATHS_PER_BLOCK = BLOCK_SIZE / SUBGROUP_SIZE;
 	// Right-image features needed by one block for one row: the disparity range plus
 	// one entry per path handled by the block.
 	static const unsigned int RIGHT_BUFFER_SIZE = MAX_DISPARITY + PATHS_PER_BLOCK;
 	static const unsigned int RIGHT_BUFFER_ROWS = RIGHT_BUFFER_SIZE / DP_BLOCK_SIZE;
 	static_assert(X_DIRECTION == 1 || X_DIRECTION == -1, "");
 	static_assert(Y_DIRECTION == 1 || Y_DIRECTION == -1, "");
 	if (width == 0 || height == 0) {
 		return;
 	}
 	// Entry i of the row buffer is stored at [i % DP_BLOCK_SIZE][i / DP_BLOCK_SIZE] and
 	// duplicated at [i % DP_BLOCK_SIZE + DP_BLOCK_SIZE][i / DP_BLOCK_SIZE - 1], so that any
 	// DP_BLOCK_SIZE consecutive entries can be read with a single second index.
 	__shared__ CENSUS_TYPE right_buffer[2 * DP_BLOCK_SIZE][RIGHT_BUFFER_ROWS];
 	DynamicProgramming<DP_BLOCK_SIZE, SUBGROUP_SIZE> dp;
 	const unsigned int warp_id = threadIdx.x / WARP_SIZE;
 	const unsigned int group_id = threadIdx.x % WARP_SIZE / SUBGROUP_SIZE;
 	const unsigned int lane_id = threadIdx.x % SUBGROUP_SIZE;
 	// Shuffle mask restricted to the lanes of this thread's subgroup.
 	const unsigned int shfl_mask =
 		generate_mask<SUBGROUP_SIZE>() << (group_id * SUBGROUP_SIZE);
 	// Column of this thread's path at the first iteration. For X_DIRECTION > 0 the start is
 	// shifted left by (height - 1), so it can be negative.
 	const int x0 =
 		blockIdx.x * PATHS_PER_BLOCK +
 		warp_id * PATHS_PER_WARP +
 		group_id +
 		(X_DIRECTION > 0 ? -static_cast<int>(height - 1) : 0);
 	// Column of the block's first path at the first iteration.
 	const int right_x00 =
 		blockIdx.x * PATHS_PER_BLOCK +
 		(X_DIRECTION > 0 ? -static_cast<int>(height - 1) : 0);
 	// First disparity index handled by this thread.
 	const unsigned int dp_offset = lane_id * DP_BLOCK_SIZE;
 	// Buffer index of the right-image feature matching this thread's first disparity.
 	const unsigned int right0_addr =
 		static_cast<unsigned int>(right_x00 + PATHS_PER_BLOCK - 1 - x0) + dp_offset;
 	const unsigned int right0_addr_lo = right0_addr % DP_BLOCK_SIZE;
 	const unsigned int right0_addr_hi = right0_addr / DP_BLOCK_SIZE;
 	for (unsigned int iter = 0; iter < height; ++iter) {
 		const int y = static_cast<int>(Y_DIRECTION > 0 ? iter : height - 1 - iter);
 		// The path and the block's buffer window both advance one column per row.
 		const int x = x0 + static_cast<int>(iter) * X_DIRECTION;
 		const int right_x0 = right_x00 + static_cast<int>(iter) * X_DIRECTION;
 		// Load right to smem: buffer entry i holds right(right_x0 + PATHS_PER_BLOCK - 1 - i - min_disp),
 		// or 0 when that column lies outside the image
 		for (unsigned int i0 = 0; i0 < RIGHT_BUFFER_SIZE; i0 += BLOCK_SIZE) {
 			const unsigned int i = i0 + threadIdx.x;
 			if (i < RIGHT_BUFFER_SIZE) {
 				const int right_x = static_cast<int>(right_x0 + PATHS_PER_BLOCK - 1 - i - min_disp);
 				const CENSUS_TYPE right_value = load_census_with_check(&right[y * width], right_x, width);
 				const unsigned int lo = i % DP_BLOCK_SIZE;
 				const unsigned int hi = i / DP_BLOCK_SIZE;
 				right_buffer[lo][hi] = right_value;
 				if (hi > 0) {
 					right_buffer[lo + DP_BLOCK_SIZE][hi - 1] = right_value;
 				}
 			}
 		}
 		// All threads of the block must finish filling the buffer before anyone reads it.
 		__syncthreads();
 		// Compute (only while the path is inside the image)
 		if (0 <= x && x < static_cast<int>(width)) {
 			const CENSUS_TYPE left_value = __ldg(&left[x + y * width]);
 			CENSUS_TYPE right_values[DP_BLOCK_SIZE];
 			for (unsigned int j = 0; j < DP_BLOCK_SIZE; ++j) {
 				right_values[j] = right_buffer[right0_addr_lo + j][right0_addr_hi];
 			}
 			// Matching cost: Hamming distance between left and right census features.
 			uint32_t local_costs[DP_BLOCK_SIZE];
 			for (unsigned int j = 0; j < DP_BLOCK_SIZE; ++j) {
 				local_costs[j] = popcnt(left_value ^ right_values[j]);
 			}
 			dp.update(local_costs, p1, p2, shfl_mask);
 			// Write this thread's DP_BLOCK_SIZE aggregated costs for pixel (x, y).
 			store_uint8_vector<DP_BLOCK_SIZE>(
 				&dest[dp_offset + x * MAX_DISPARITY + y * MAX_DISPARITY * width],
 				dp.dp);
 		}
 		// Keep the buffer intact until every thread has read it; the next row overwrites it.
 		__syncthreads();
 	}
 }
 // Launches the oblique aggregation kernel for paths running from the upper left to the
 // lower right (x and y both increasing) on `stream`.
 // The grid covers (width + height - 1) diagonal paths, PATHS_PER_BLOCK per block.
 template <typename CENSUS_TYPE, unsigned int MAX_DISPARITY>
 void aggregate_upleft2downright(
 	COST_TYPE *dest,
 	const CENSUS_TYPE *left,
 	const CENSUS_TYPE *right,
 	int width,
 	int height,
 	unsigned int p1,
 	unsigned int p2,
 	int min_disp,
 	cudaStream_t stream)
 {
 	// Threads cooperating on a single path, and paths covered by one block.
 	constexpr unsigned int threads_per_path = MAX_DISPARITY / DP_BLOCK_SIZE;
 	constexpr unsigned int paths_per_block = BLOCK_SIZE / threads_per_path;

 	const int num_blocks = (width + height + paths_per_block - 2) / paths_per_block;
 	const int num_threads = BLOCK_SIZE;

 	aggregate_oblique_path_kernel<CENSUS_TYPE, 1, 1, MAX_DISPARITY><<<num_blocks, num_threads, 0, stream>>>(
 		dest, left, right,
 		width, height,
 		p1, p2, min_disp);

 	// Surface launch-configuration errors immediately.
 	CUDA_CHECK(cudaGetLastError());
 }
 // Launches the oblique aggregation kernel for paths running from the upper right to the
 // lower left (x decreasing, y increasing) on `stream`.
 // The grid covers (width + height - 1) diagonal paths, PATHS_PER_BLOCK per block.
 template <typename CENSUS_TYPE, unsigned int MAX_DISPARITY>
 void aggregate_upright2downleft(
 	COST_TYPE *dest,
 	const CENSUS_TYPE *left,
 	const CENSUS_TYPE *right,
 	int width,
 	int height,
 	unsigned int p1,
 	unsigned int p2,
 	int min_disp,
 	cudaStream_t stream)
 {
 	// Threads cooperating on a single path, and paths covered by one block.
 	constexpr unsigned int threads_per_path = MAX_DISPARITY / DP_BLOCK_SIZE;
 	constexpr unsigned int paths_per_block = BLOCK_SIZE / threads_per_path;

 	const int num_blocks = (width + height + paths_per_block - 2) / paths_per_block;
 	const int num_threads = BLOCK_SIZE;

 	aggregate_oblique_path_kernel<CENSUS_TYPE, -1, 1, MAX_DISPARITY><<<num_blocks, num_threads, 0, stream>>>(
 		dest, left, right,
 		width, height,
 		p1, p2, min_disp);

 	// Surface launch-configuration errors immediately.
 	CUDA_CHECK(cudaGetLastError());
 }
 // Enqueues aggregate_oblique_path_kernel<CENSUS_TYPE, -1, -1, MAX_DISPARITY> on
 // `stream` and checks cudaGetLastError() right after the launch.
 // NOTE(review): the (-1, -1) template arguments presumably encode the x/y step
 // of the path named by this function; the kernel signature is not visible
 // here, so confirm against its definition.
 template <typename CENSUS_TYPE, unsigned int MAX_DISPARITY>
 void aggregate_downright2upleft(
 	COST_TYPE *dest,
 	const CENSUS_TYPE *left,
 	const CENSUS_TYPE *right,
 	int width,
 	int height,
 	unsigned int p1,
 	unsigned int p2,
 	int min_disp,
 	cudaStream_t stream)
 {
 	static const unsigned int subgroup_size = MAX_DISPARITY / DP_BLOCK_SIZE;
 	static const unsigned int paths_in_block = BLOCK_SIZE / subgroup_size;
 	// Rounds (width + height - 1) up to a whole number of blocks.
 	const int num_blocks = (width + height + paths_in_block - 2) / paths_in_block;
 	const int threads_in_block = BLOCK_SIZE;
 	aggregate_oblique_path_kernel<CENSUS_TYPE, -1, -1, MAX_DISPARITY><<<num_blocks, threads_in_block, 0, stream>>>(
 		dest, left, right, width, height, p1, p2, min_disp);
 	CUDA_CHECK(cudaGetLastError());
 }
 // Enqueues aggregate_oblique_path_kernel<CENSUS_TYPE, 1, -1, MAX_DISPARITY> on
 // `stream` and checks cudaGetLastError() right after the launch.
 // NOTE(review): the (1, -1) template arguments presumably encode the x/y step
 // of the path named by this function; the kernel signature is not visible
 // here, so confirm against its definition.
 template <typename CENSUS_TYPE, unsigned int MAX_DISPARITY>
 void aggregate_downleft2upright(
 	COST_TYPE *dest,
 	const CENSUS_TYPE *left,
 	const CENSUS_TYPE *right,
 	int width,
 	int height,
 	unsigned int p1,
 	unsigned int p2,
 	int min_disp,
 	cudaStream_t stream)
 {
 	static const unsigned int subgroup_size = MAX_DISPARITY / DP_BLOCK_SIZE;
 	static const unsigned int paths_in_block = BLOCK_SIZE / subgroup_size;
 	// Rounds (width + height - 1) up to a whole number of blocks.
 	const int num_blocks = (width + height + paths_in_block - 2) / paths_in_block;
 	const int threads_in_block = BLOCK_SIZE;
 	aggregate_oblique_path_kernel<CENSUS_TYPE, 1, -1, MAX_DISPARITY><<<num_blocks, threads_in_block, 0, stream>>>(
 		dest, left, right, width, height, p1, p2, min_disp);
 	CUDA_CHECK(cudaGetLastError());
 }
 } // namespace oblique
 } // namespace cost_aggregation
 namespace details
 {
 template <typename CENSUS_TYPE, int MAX_DISPARITY>
 void cost_aggregation_(const DeviceImage& srcL, const DeviceImage& srcR, DeviceImage& dst,
 	int P1, int P2, PathType path_type, int min_disp)
 {
 	const int width = srcL.cols;
 	const int height = srcL.rows;
 	const int num_paths = path_type == PathType::SCAN_4PATH ? 4 : 8;
 	dst.create(num_paths, height * width * MAX_DISPARITY, SGM_8U);
 	const CENSUS_TYPE* left = srcL.ptr<CENSUS_TYPE>();
 	const CENSUS_TYPE* right = srcR.ptr<CENSUS_TYPE>();
 	cudaStream_t streams[8];
 	for (int i = 0; i < num_paths; i++)
 		cudaStreamCreate(&streams[i]);
 	cost_aggregation::vertical::aggregate_up2down<CENSUS_TYPE, MAX_DISPARITY>(
 		dst.ptr<COST_TYPE>(0), left, right, width, height, P1, P2, min_disp, streams[0]);
 	cost_aggregation::vertical::aggregate_down2up<CENSUS_TYPE, MAX_DISPARITY>(
 		dst.ptr<COST_TYPE>(1), left, right, width, height, P1, P2, min_disp, streams[1]);
 	cost_aggregation::horizontal::aggregate_left2right<CENSUS_TYPE, MAX_DISPARITY>(
 		dst.ptr<COST_TYPE>(2), left, right, width, height, P1, P2, min_disp, streams[2]);
 	cost_aggregation::horizontal::aggregate_right2left<CENSUS_TYPE, MAX_DISPARITY>(
 		dst.ptr<COST_TYPE>(3), left, right, width, height, P1, P2, min_disp, streams[3]);
 	if (path_type == PathType::SCAN_8PATH) {
 		cost_aggregation::oblique::aggregate_upleft2downright<CENSUS_TYPE, MAX_DISPARITY>(
 			dst.ptr<COST_TYPE>(4), left, right, width, height, P1, P2, min_disp, streams[4]);
 		cost_aggregation::oblique::aggregate_upright2downleft<CENSUS_TYPE, MAX_DISPARITY>(
 			dst.ptr<COST_TYPE>(5), left, right, width, height, P1, P2, min_disp, streams[5]);
 		cost_aggregation::oblique::aggregate_downright2upleft<CENSUS_TYPE, MAX_DISPARITY>(
 			dst.ptr<COST_TYPE>(6), left, right, width, height, P1, P2, min_disp, streams[6]);
 		cost_aggregation::oblique::aggregate_downleft2upright<CENSUS_TYPE, MAX_DISPARITY>(
 			dst.ptr<COST_TYPE>(7), left, right, width, height, P1, P2, min_disp, streams[7]);
 	}
 	for (int i = 0; i < num_paths; i++)
 		cudaStreamSynchronize(streams[i]);
 	for (int i = 0; i < num_paths; i++)
 		cudaStreamDestroy(streams[i]);
 }
 // Dispatches to the cost_aggregation_ instantiation matching the census
 // element type (SGM_32U / SGM_64U) and the disparity range (64 / 128 / 256).
 // Throws std::logic_error (via SGM_ASSERT) when the two inputs differ in
 // type. Any other type / disp_size combination is a silent no-op that
 // leaves `dst` untouched.
 void cost_aggregation(const DeviceImage& srcL, const DeviceImage& srcR, DeviceImage& dst,
 	int disp_size, int P1, int P2, PathType path_type, int min_disp)
 {
 	SGM_ASSERT(srcL.type == srcR.type, "left and right image type must be same.");
 	if (srcL.type == SGM_32U) {
 		switch (disp_size) {
 		case 64:
 			cost_aggregation_<uint32_t, 64>(srcL, srcR, dst, P1, P2, path_type, min_disp);
 			break;
 		case 128:
 			cost_aggregation_<uint32_t, 128>(srcL, srcR, dst, P1, P2, path_type, min_disp);
 			break;
 		case 256:
 			cost_aggregation_<uint32_t, 256>(srcL, srcR, dst, P1, P2, path_type, min_disp);
 			break;
 		default:
 			break;
 		}
 		return;
 	}
 	if (srcL.type == SGM_64U) {
 		switch (disp_size) {
 		case 64:
 			cost_aggregation_<uint64_t, 64>(srcL, srcR, dst, P1, P2, path_type, min_disp);
 			break;
 		case 128:
 			cost_aggregation_<uint64_t, 128>(srcL, srcR, dst, P1, P2, path_type, min_disp);
 			break;
 		case 256:
 			cost_aggregation_<uint64_t, 256>(srcL, srcR, dst, P1, P2, path_type, min_disp);
 			break;
 		default:
 			break;
 		}
 	}
 }
 } // namespace details
 } // namespace sgm
@@ -0,0 +1,76 @@
 /*
 Copyright 2016 Fixstars Corporation
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
 #include "internal.h"
 #include <cuda_runtime.h>
 #include "host_utility.h"
 namespace
 {
 // Copies arr16bits[i] into arr8bits[i] for i < num_elements, narrowing each
 // value with static_cast<uint8_t>. One element per thread; threads whose
 // global index falls past the end do nothing.
 __global__ void cast_16bit_8bit_array_kernel(const uint16_t* arr16bits, uint8_t* arr8bits, int num_elements)
 {
 	const int idx = blockIdx.x * blockDim.x + threadIdx.x;
 	if (idx >= num_elements)
 		return;
 	arr8bits[idx] = static_cast<uint8_t>(arr16bits[idx]);
 }
 // Copies arr8bits[i] into arr16bits[i] for i < num_elements, widening each
 // value with static_cast<uint16_t>. One element per thread; threads whose
 // global index falls past the end do nothing.
 __global__ void cast_8bit_16bit_array_kernel(const uint8_t* arr8bits, uint16_t* arr16bits, int num_elements)
 {
 	const int idx = blockIdx.x * blockDim.x + threadIdx.x;
 	if (idx >= num_elements)
 		return;
 	arr16bits[idx] = static_cast<uint16_t>(arr8bits[idx]);
 }
 } // namespace
 namespace sgm
 {
 namespace details
 {
 // Converts a 16-bit image to 8 bits per element on the device.
 // `dst` is (re)created with the same rows, cols and step as `src`; the
 // conversion covers rows * step elements, i.e. row padding included.
 void cast_16bit_to_8bit(const DeviceImage& src, DeviceImage& dst)
 {
 	const int num_cols = src.cols;
 	const int num_rows = src.rows;
 	dst.create(num_rows, num_cols, SGM_8U, src.step);
 	const int total = num_rows * src.step;
 	const int threads_per_block = 1024;
 	const int num_blocks = divUp(total, threads_per_block);
 	cast_16bit_8bit_array_kernel<<<num_blocks, threads_per_block>>>(src.ptr<uint16_t>(), dst.ptr<uint8_t>(), total);
 	CUDA_CHECK(cudaGetLastError());
 }
 // Converts an 8-bit image to 16 bits per element on the device.
 // `dst` is (re)created with the same rows, cols and step as `src`; the
 // conversion covers rows * step elements, i.e. row padding included.
 void cast_8bit_to_16bit(const DeviceImage& src, DeviceImage& dst)
 {
 	const int num_cols = src.cols;
 	const int num_rows = src.rows;
 	dst.create(num_rows, num_cols, SGM_16U, src.step);
 	const int total = num_rows * src.step;
 	const int threads_per_block = 1024;
 	const int num_blocks = divUp(total, threads_per_block);
 	cast_8bit_16bit_array_kernel<<<num_blocks, threads_per_block>>>(src.ptr<uint8_t>(), dst.ptr<uint16_t>(), total);
 	CUDA_CHECK(cudaGetLastError());
 }
 } // namespace details
 } // namespace sgm
@@ -0,0 +1,110 @@
 /*
 Copyright 2016 Fixstars Corporation
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
 #include "device_allocator.h"
 #include <cuda_runtime.h>
 #include "host_utility.h"
 namespace sgm
 {
 // Creates an empty allocator: no buffer, no reference count, zero capacity.
 DeviceAllocator::DeviceAllocator()
 	: data_(nullptr)
 	, ref_count_(nullptr)
 	, capacity_(0)
 {
 }
 // Shares other's buffer: copies pointer, counter and capacity, and bumps the
 // shared reference count when there is one.
 DeviceAllocator::DeviceAllocator(const DeviceAllocator& other)
 	: data_(other.data_)
 	, ref_count_(other.ref_count_)
 	, capacity_(other.capacity_)
 {
 	if (ref_count_)
 		++(*ref_count_);
 }
 // Takes over right's buffer and leaves `right` empty; the reference count is
 // transferred unchanged.
 DeviceAllocator::DeviceAllocator(DeviceAllocator&& right)
 	: data_(right.data_)
 	, ref_count_(right.ref_count_)
 	, capacity_(right.capacity_)
 {
 	right.data_ = nullptr;
 	right.ref_count_ = nullptr;
 	right.capacity_ = 0;
 }
 // Drops this object's reference; release() frees the device buffer when this
 // was the last reference to an owned allocation.
 DeviceAllocator::~DeviceAllocator()
 {
 	release();
 }
 void* DeviceAllocator::allocate(size_t size)
 {
 	if (size > capacity_)
 	{
 		release();
 		CUDA_CHECK(cudaMalloc(&data_, size));
 		ref_count_ = new int(1);
 		capacity_ = size;
 	}
 	return data_;
 }
 // Wraps an externally provided buffer of `size` bytes.
 // release() leaves ref_count_ null, so the wrapped buffer is never freed by
 // this class.
 void DeviceAllocator::assign(void* data, size_t size)
 {
 	release();
 	capacity_ = size;
 	data_ = data;
 }
 // Gives up this object's reference and resets it to the empty state.
 // The device buffer and the counter are freed only when this was the last
 // reference; buffers without a counter (see assign()) are never freed.
 void DeviceAllocator::release()
 {
 	if (ref_count_)
 	{
 		--(*ref_count_);
 		if (*ref_count_ == 0)
 		{
 			CUDA_CHECK(cudaFree(data_));
 			delete ref_count_;
 		}
 	}
 	data_ = nullptr;
 	ref_count_ = nullptr;
 	capacity_ = 0;
 }
 // Copy assignment: drops the current reference and shares other's buffer.
 // Self-assignment is a no-op; without the guard, release() would free (or
 // forget) the buffer before it could be copied back from `other`.
 DeviceAllocator& DeviceAllocator::operator=(const DeviceAllocator& other)
 {
 	if (this != &other)
 	{
 		release();
 		copy_construct_from(other);
 	}
 	return *this;
 }
 // Move assignment: drops the current reference and takes over right's
 // buffer, leaving `right` empty.
 // Self-move is a no-op; without the guard, release() would free (or forget)
 // the buffer before it could be taken over.
 DeviceAllocator& DeviceAllocator::operator=(DeviceAllocator&& right)
 {
 	if (this != &right)
 	{
 		release();
 		move_construct_from(std::move(right));
 	}
 	return *this;
 }
 // Overwrites all members with other's and adds one reference to the shared
 // counter, if any. Does not release whatever this object held before.
 void DeviceAllocator::copy_construct_from(const DeviceAllocator& other)
 {
 	capacity_ = other.capacity_;
 	ref_count_ = other.ref_count_;
 	data_ = other.data_;
 	if (ref_count_ != nullptr)
 		++(*ref_count_);
 }
 // Overwrites all members with right's, then resets `right` to the empty
 // state. Does not release whatever this object held before.
 void DeviceAllocator::move_construct_from(DeviceAllocator&& right)
 {
 	capacity_ = right.capacity_;
 	ref_count_ = right.ref_count_;
 	data_ = right.data_;
 	right.data_ = nullptr;
 	right.ref_count_ = nullptr;
 	right.capacity_ = 0;
 }
 } // namespace sgm
@@ -0,0 +1,52 @@
 /*
 Copyright 2016 Fixstars Corporation
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
 #ifndef __DEVICE_ALLOCATOR_H__
 #define __DEVICE_ALLOCATOR_H__
 #include <cstddef>
 namespace sgm
 {
 // Handle to a device memory buffer.
 //
 // Buffers obtained through allocate() are reference counted: copies share
 // the buffer and it is freed with cudaFree when the last handle releases it.
 // Buffers passed to assign() carry no counter and are never freed here.
 // The counter is a plain int, so sharing handles across threads needs
 // external synchronization.
 class DeviceAllocator
 {
 public:
 	// Creates an empty handle (null buffer, zero capacity).
 	DeviceAllocator();
 	// Shares other's buffer and increments its reference count, if any.
 	DeviceAllocator(const DeviceAllocator& other);
 	// Takes over right's buffer and leaves `right` empty.
 	DeviceAllocator(DeviceAllocator&& right);
 	// Releases this handle's reference.
 	~DeviceAllocator();
 	// Returns a buffer of at least `size` bytes, reallocating only when `size`
 	// exceeds the current capacity.
 	void* allocate(size_t size);
 	// Releases the current buffer and wraps `data` (`size` bytes) without
 	// taking ownership.
 	void assign(void* data, size_t size);
 	// Drops this handle's reference and resets it to the empty state.
 	void release();
 	DeviceAllocator& operator=(const DeviceAllocator& other);
 	DeviceAllocator& operator=(DeviceAllocator&& right);
 private:
 	// Member-wise helpers shared by the constructors and assignment operators;
 	// neither releases the state held before the call.
 	void copy_construct_from(const DeviceAllocator& other);
 	void move_construct_from(DeviceAllocator&& right);
 	void* data_;       // device buffer, or nullptr when empty
 	int* ref_count_;   // shared counter; nullptr for empty or assigned buffers
 	size_t capacity_;  // size of the buffer in bytes
 };
 } // namespace sgm
 #endif // !__DEVICE_ALLOCATOR_H__
@@ -0,0 +1,93 @@
 /*
 Copyright 2016 Fixstars Corporation
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
 #include "device_image.h"
 #include <cuda_runtime.h>
 #include "host_utility.h"
 namespace sgm
 {
 // Size in bytes of one element of the given image type; 0 for any value
 // outside the ImageType enumerators.
 static size_t elemSize(ImageType type)
 {
 	switch (type) {
 	case SGM_8U:
 		return 1;
 	case SGM_16U:
 		return 2;
 	case SGM_32U:
 		return 4;
 	case SGM_64U:
 		return 8;
 	default:
 		return 0;
 	}
 }
 // Creates an empty 0x0 image of type SGM_8U with no buffer attached.
 DeviceImage::DeviceImage()
 	: data(nullptr)
 	, rows(0)
 	, cols(0)
 	, step(0)
 	, type(SGM_8U)
 {
 }
 // Constructs an image backed by its own device buffer; all members are set
 // by create(). A negative `step` means "use cols".
 DeviceImage::DeviceImage(int rows, int cols, ImageType type, int step)
 {
 	create(rows, cols, type, step);
 }
 // Constructs an image that wraps the caller-provided buffer `data`; all
 // members are set by create(). A negative `step` means "use cols".
 DeviceImage::DeviceImage(void* data, int rows, int cols, ImageType type, int step)
 {
 	create(data, rows, cols, type, step);
 }
 // (Re)shapes the image and makes sure the allocator holds at least
 // elemSize(type) * rows * step bytes. `_step` counts elements per row; a
 // negative value selects `_cols`.
 void DeviceImage::create(int _rows, int _cols, ImageType _type, int _step)
 {
 	const int pitch = _step < 0 ? _cols : _step;
 	data = allocator_.allocate(elemSize(_type) * _rows * pitch);
 	type = _type;
 	step = pitch;
 	cols = _cols;
 	rows = _rows;
 }
 // Points the image at the caller-provided buffer `_data` and records its
 // shape. `_step` counts elements per row; a negative value selects `_cols`.
 // The buffer is handed to the allocator via assign().
 void DeviceImage::create(void* _data, int _rows, int _cols, ImageType _type, int _step)
 {
 	const int pitch = _step < 0 ? _cols : _step;
 	allocator_.assign(_data, elemSize(_type) * _rows * pitch);
 	data = _data;
 	type = _type;
 	step = pitch;
 	cols = _cols;
 	rows = _rows;
 }
 // Copies elemSize(type) * rows * step bytes from host memory `_data` into
 // the image buffer (blocking cudaMemcpy).
 void DeviceImage::upload(const void* _data)
 {
 	const size_t num_bytes = elemSize(type) * rows * step;
 	CUDA_CHECK(cudaMemcpy(data, _data, num_bytes, cudaMemcpyHostToDevice));
 }
 // Copies elemSize(type) * rows * step bytes from the image buffer into host
 // memory `_data` (blocking cudaMemcpy).
 void DeviceImage::download(void* _data) const
 {
 	const size_t num_bytes = elemSize(type) * rows * step;
 	CUDA_CHECK(cudaMemcpy(_data, data, num_bytes, cudaMemcpyDeviceToHost));
 }
 // Zeroes the whole image buffer: elemSize(type) * rows * step bytes.
 void DeviceImage::fill_zero()
 {
 	const size_t num_bytes = elemSize(type) * rows * step;
 	CUDA_CHECK(cudaMemset(data, 0, num_bytes));
 }
 } // namespace sgm
@@ -0,0 +1,62 @@
 /*
 Copyright 2016 Fixstars Corporation
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
 #ifndef __DEVICE_IMAGE_H__
 #define __DEVICE_IMAGE_H__
 #include "device_allocator.h"
 namespace sgm
 {
 // Element type of a DeviceImage: unsigned integers of 8, 16, 32 or 64 bits.
 enum ImageType
 {
 	SGM_8U,   // 1 byte per element
 	SGM_16U,  // 2 bytes per element
 	SGM_32U,  // 4 bytes per element
 	SGM_64U,  // 8 bytes per element
 };
 // 2-D image stored in device memory.
 //
 // `step` is the row pitch measured in elements (not bytes); the buffer spans
 // elemSize(type) * rows * step bytes. The buffer is either obtained through
 // the embedded DeviceAllocator or supplied by the caller (see the create()
 // overloads).
 class DeviceImage
 {
 public:
 	DeviceImage();
 	DeviceImage(int rows, int cols, ImageType type, int step = -1);
 	DeviceImage(void* data, int rows, int cols, ImageType type, int step = -1);

 	// Allocating variant: reuses the current buffer when it is large enough.
 	void create(int rows, int cols, ImageType type, int step = -1);
 	// Wrapping variant: uses the caller-provided buffer `data`.
 	void create(void* data, int rows, int cols, ImageType type, int step = -1);

 	// Blocking host<->device copies of the whole buffer.
 	void upload(const void* data);
 	void download(void* data) const;
 	void fill_zero();

 	// Pointer to the first element of row `y`, viewing the buffer as T.
 	template <typename T>
 	T* ptr(int y = 0)
 	{
 		return static_cast<T*>(data) + y * static_cast<size_t>(step);
 	}
 	template <typename T>
 	const T* ptr(int y = 0) const
 	{
 		return static_cast<const T*>(data) + y * static_cast<size_t>(step);
 	}

 	void* data;
 	int rows;
 	int cols;
 	int step;
 	ImageType type;
 private:
 	DeviceAllocator allocator_;
 };
 } // namespace sgm
 #endif // !__DEVICE_IMAGE_H__
@@ -0,0 +1,283 @@
 /*
 Copyright 2016 Fixstars Corporation
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
 #ifndef __DEVICE_UTILITY_H__
 #define __DEVICE_UTILITY_H__
 #include <cuda.h>
 #include "types.h"
 #include "constants.h"
 namespace sgm
 {
 namespace detail
 {
 // One step of a recursive XOR-shuffle minimum reduction.
 // Each call combines x with the value held by the lane whose index differs
 // by STEP / 2 (shuffle width GROUP_SIZE), then recurses with STEP halved.
 // When started with STEP == GROUP_SIZE (a power of two), every lane of the
 // group ends up holding the minimum over the group.
 // `mask` is only used by the CUDA >= 9.0 *_sync shuffle variant.
 template <typename T, unsigned int GROUP_SIZE, unsigned int STEP>
 struct subgroup_min_impl
 {
 	static __device__ T call(T x, uint32_t mask)
 	{
 #if CUDA_VERSION >= 9000
 		x = min(x, __shfl_xor_sync(mask, x, STEP / 2, GROUP_SIZE));
 #else
 		x = min(x, __shfl_xor(x, STEP / 2, GROUP_SIZE));
 #endif
 		return subgroup_min_impl<T, GROUP_SIZE, STEP / 2>::call(x, mask);
 	}
 };
 // Recursion terminator: STEP == 1 means nothing is left to combine.
 template <typename T, unsigned int GROUP_SIZE>
 struct subgroup_min_impl<T, GROUP_SIZE, 1u>
 {
 	static __device__ T call(T x, uint32_t)
 	{
 		return x;
 	}
 };
 // One step of a recursive XOR-shuffle AND reduction over bool values.
 // Each call ANDs x with the value held by the lane whose index differs by
 // STEP / 2 (shuffle width GROUP_SIZE), then recurses with STEP halved.
 // When started with STEP == GROUP_SIZE (a power of two), every lane of the
 // group ends up holding the conjunction over the group.
 // `mask` is only used by the CUDA >= 9.0 *_sync shuffle variant.
 template <unsigned int GROUP_SIZE, unsigned int STEP>
 struct subgroup_and_impl
 {
 	static __device__ bool call(bool x, uint32_t mask)
 	{
 #if CUDA_VERSION >= 9000
 		x &= __shfl_xor_sync(mask, x, STEP / 2, GROUP_SIZE);
 #else
 		x &= __shfl_xor(x, STEP / 2, GROUP_SIZE);
 #endif
 		return subgroup_and_impl<GROUP_SIZE, STEP / 2>::call(x, mask);
 	}
 };
 // Recursion terminator: STEP == 1 means nothing is left to combine.
 template <unsigned int GROUP_SIZE>
 struct subgroup_and_impl<GROUP_SIZE, 1u>
 {
 	static __device__ bool call(bool x, uint32_t)
 	{
 		return x;
 	}
 };
 } // namespace detail
 // Minimum of x across groups of GROUP_SIZE lanes, computed with warp
 // shuffles; `mask` is forwarded to the shuffle intrinsics.
 template <unsigned int GROUP_SIZE, typename T>
 __device__ inline T subgroup_min(T x, uint32_t mask)
 {
 	typedef detail::subgroup_min_impl<T, GROUP_SIZE, GROUP_SIZE> reducer;
 	return reducer::call(x, mask);
 }
 // Logical AND of x across groups of GROUP_SIZE lanes, computed with warp
 // shuffles; `mask` is forwarded to the shuffle intrinsics.
 template <unsigned int GROUP_SIZE>
 __device__ inline bool subgroup_and(bool x, uint32_t mask)
 {
 	typedef detail::subgroup_and_impl<GROUP_SIZE, GROUP_SIZE> reducer;
 	return reducer::call(x, mask);
 }
 // Reads the object at `p` as if it were a T (reinterpret_cast load).
 // NOTE(review): `p` presumably has to satisfy T's alignment — confirm at call sites.
 template <typename T, typename S>
 __device__ inline T load_as(const S *p)
 {
 	const T *typed = reinterpret_cast<const T *>(p);
 	return *typed;
 }
 // Writes x to `p` as if `p` pointed to a T (reinterpret_cast store).
 // NOTE(review): `p` presumably has to satisfy T's alignment — confirm at call sites.
 template <typename T, typename S>
 __device__ inline void store_as(S *p, const T& x)
 {
 	T *typed = reinterpret_cast<T *>(p);
 	*typed = x;
 }
 // Narrows x, y, z, w to uint8_t, stores them in a uchar4 in that member
 // order and returns the uchar4's bytes reinterpreted as one uint32_t.
 template <typename T>
 __device__ inline uint32_t pack_uint8x4(T x, T y, T z, T w)
 {
 	uchar4 bytes;
 	bytes.x = static_cast<uint8_t>(x);
 	bytes.y = static_cast<uint8_t>(y);
 	bytes.z = static_cast<uint8_t>(z);
 	bytes.w = static_cast<uint8_t>(w);
 	const uint32_t packed = load_as<uint32_t>(&bytes);
 	return packed;
 }
 // load_uint8_vector<N>: reads N consecutive uint8_t values starting at `ptr`
 // and widens them into dest[0..N-1]. Specialized for N = 1, 2, 4, 8, 16.
 // For N > 1 the bytes are fetched with a single vector-typed load
 // (uchar2 / uchar4 / uint2 / uint4) through load_as.
 // NOTE(review): `ptr` presumably must be aligned for that vector type —
 // confirm at call sites.
 template <unsigned int N>
 __device__ inline void load_uint8_vector(uint32_t *dest, const uint8_t *ptr);
 template <>
 __device__ inline void load_uint8_vector<1u>(uint32_t *dest, const uint8_t *ptr)
 {
 	dest[0] = static_cast<uint32_t>(ptr[0]);
 }
 template <>
 __device__ inline void load_uint8_vector<2u>(uint32_t *dest, const uint8_t *ptr)
 {
 	const auto uint8x2 = load_as<uchar2>(ptr);
 	dest[0] = uint8x2.x; dest[1] = uint8x2.y;
 }
 template <>
 __device__ inline void load_uint8_vector<4u>(uint32_t *dest, const uint8_t *ptr)
 {
 	const auto uint8x4 = load_as<uchar4>(ptr);
 	dest[0] = uint8x4.x; dest[1] = uint8x4.y; dest[2] = uint8x4.z; dest[3] = uint8x4.w;
 }
 template <>
 __device__ inline void load_uint8_vector<8u>(uint32_t *dest, const uint8_t *ptr)
 {
 	// One 8-byte load, then each 32-bit half is unpacked as four bytes.
 	const auto uint32x2 = load_as<uint2>(ptr);
 	load_uint8_vector<4u>(dest + 0, reinterpret_cast<const uint8_t *>(&uint32x2.x));
 	load_uint8_vector<4u>(dest + 4, reinterpret_cast<const uint8_t *>(&uint32x2.y));
 }
 template <>
 __device__ inline void load_uint8_vector<16u>(uint32_t *dest, const uint8_t *ptr)
 {
 	// One 16-byte load, then each 32-bit word is unpacked as four bytes.
 	const auto uint32x4 = load_as<uint4>(ptr);
 	load_uint8_vector<4u>(dest +  0, reinterpret_cast<const uint8_t *>(&uint32x4.x));
 	load_uint8_vector<4u>(dest +  4, reinterpret_cast<const uint8_t *>(&uint32x4.y));
 	load_uint8_vector<4u>(dest +  8, reinterpret_cast<const uint8_t *>(&uint32x4.z));
 	load_uint8_vector<4u>(dest + 12, reinterpret_cast<const uint8_t *>(&uint32x4.w));
 }
 // store_uint8_vector<N>: narrows ptr[0..N-1] to uint8_t and writes them as N
 // consecutive bytes starting at `dest`. Specialized for N = 1, 2, 4, 8, 16.
 // For N > 1 the bytes are packed first and written with a single
 // vector-typed store (uchar2 / uint32_t / uint2 / uint4) through store_as.
 // NOTE(review): `dest` presumably must be aligned for that vector type —
 // confirm at call sites.
 template <unsigned int N>
 __device__ inline void store_uint8_vector(uint8_t *dest, const uint32_t *ptr);
 template <>
 __device__ inline void store_uint8_vector<1u>(uint8_t *dest, const uint32_t *ptr)
 {
 	dest[0] = static_cast<uint8_t>(ptr[0]);
 }
 template <>
 __device__ inline void store_uint8_vector<2u>(uint8_t *dest, const uint32_t *ptr)
 {
 	uchar2 uint8x2;
 	uint8x2.x = static_cast<uint8_t>(ptr[0]);
 	uint8x2.y = static_cast<uint8_t>(ptr[1]);
 	store_as<uchar2>(dest, uint8x2);
 }
 template <>
 __device__ inline void store_uint8_vector<4u>(uint8_t *dest, const uint32_t *ptr)
 {
 	store_as<uint32_t>(dest, pack_uint8x4(ptr[0], ptr[1], ptr[2], ptr[3]));
 }
 template <>
 __device__ inline void store_uint8_vector<8u>(uint8_t *dest, const uint32_t *ptr)
 {
 	uint2 uint32x2;
 	uint32x2.x = pack_uint8x4(ptr[0], ptr[1], ptr[2], ptr[3]);
 	uint32x2.y = pack_uint8x4(ptr[4], ptr[5], ptr[6], ptr[7]);
 	store_as<uint2>(dest, uint32x2);
 }
 template <>
 __device__ inline void store_uint8_vector<16u>(uint8_t *dest, const uint32_t *ptr)
 {
 	uint4 uint32x4;
 	uint32x4.x = pack_uint8x4(ptr[ 0], ptr[ 1], ptr[ 2], ptr[ 3]);
 	uint32x4.y = pack_uint8x4(ptr[ 4], ptr[ 5], ptr[ 6], ptr[ 7]);
 	uint32x4.z = pack_uint8x4(ptr[ 8], ptr[ 9], ptr[10], ptr[11]);
 	uint32x4.w = pack_uint8x4(ptr[12], ptr[13], ptr[14], ptr[15]);
 	store_as<uint4>(dest, uint32x4);
 }
 // load_uint16_vector<N>: reads N consecutive uint16_t values starting at
 // `ptr` and widens them into dest[0..N-1]. Specialized for N = 1, 2, 4, 8
 // (there is no 16-wide load, unlike store_uint16_vector).
 // For N > 1 the values are fetched with a single vector-typed load
 // (ushort2 / ushort4 / uint4) through load_as.
 // NOTE(review): `ptr` presumably must be aligned for that vector type —
 // confirm at call sites.
 template <unsigned int N>
 __device__ inline void load_uint16_vector(uint32_t *dest, const uint16_t *ptr);
 template <>
 __device__ inline void load_uint16_vector<1u>(uint32_t *dest, const uint16_t *ptr)
 {
 	dest[0] = static_cast<uint32_t>(ptr[0]);
 }
 template <>
 __device__ inline void load_uint16_vector<2u>(uint32_t *dest, const uint16_t *ptr)
 {
 	const auto uint16x2 = load_as<ushort2>(ptr);
 	dest[0] = uint16x2.x; dest[1] = uint16x2.y;
 }
 template <>
 __device__ inline void load_uint16_vector<4u>(uint32_t *dest, const uint16_t *ptr)
 {
 	const auto uint16x4 = load_as<ushort4>(ptr);
 	dest[0] = uint16x4.x; dest[1] = uint16x4.y; dest[2] = uint16x4.z; dest[3] = uint16x4.w;
 }
 template <>
 __device__ inline void load_uint16_vector<8u>(uint32_t *dest, const uint16_t *ptr)
 {
 	// One 16-byte load, then each 32-bit word is unpacked as two uint16_t.
 	const auto uint32x4 = load_as<uint4>(ptr);
 	load_uint16_vector<2u>(dest + 0, reinterpret_cast<const uint16_t *>(&uint32x4.x));
 	load_uint16_vector<2u>(dest + 2, reinterpret_cast<const uint16_t *>(&uint32x4.y));
 	load_uint16_vector<2u>(dest + 4, reinterpret_cast<const uint16_t *>(&uint32x4.z));
 	load_uint16_vector<2u>(dest + 6, reinterpret_cast<const uint16_t *>(&uint32x4.w));
 }
 // store_uint16_vector<N>: narrows ptr[0..N-1] to uint16_t and writes them as
 // N consecutive values starting at `dest`. Specialized for N = 1, 2, 4, 8, 16.
 // For N = 2, 4, 8 the values are packed first and written with a single
 // vector-typed store (ushort2 / ushort4 / uint4) through store_as; N = 16 is
 // two 8-wide stores.
 // NOTE(review): `dest` presumably must be aligned for that vector type —
 // confirm at call sites.
 template <unsigned int N>
 __device__ inline void store_uint16_vector(uint16_t *dest, const uint32_t *ptr);
 template <>
 __device__ inline void store_uint16_vector<1u>(uint16_t *dest, const uint32_t *ptr)
 {
 	dest[0] = static_cast<uint16_t>(ptr[0]);
 }
 template <>
 __device__ inline void store_uint16_vector<2u>(uint16_t *dest, const uint32_t *ptr)
 {
 	ushort2 uint16x2;
 	uint16x2.x = static_cast<uint16_t>(ptr[0]);
 	uint16x2.y = static_cast<uint16_t>(ptr[1]);
 	store_as<ushort2>(dest, uint16x2);
 }
 template <>
 __device__ inline void store_uint16_vector<4u>(uint16_t *dest, const uint32_t *ptr)
 {
 	ushort4 uint16x4;
 	uint16x4.x = static_cast<uint16_t>(ptr[0]);
 	uint16x4.y = static_cast<uint16_t>(ptr[1]);
 	uint16x4.z = static_cast<uint16_t>(ptr[2]);
 	uint16x4.w = static_cast<uint16_t>(ptr[3]);
 	store_as<ushort4>(dest, uint16x4);
 }
 template <>
 __device__ inline void store_uint16_vector<8u>(uint16_t *dest, const uint32_t *ptr)
 {
 	// Pack pairs of values into the four words of a local uint4, then write
 	// all 16 bytes with one store.
 	uint4 uint32x4;
 	store_uint16_vector<2u>(reinterpret_cast<uint16_t *>(&uint32x4.x), &ptr[0]);
 	store_uint16_vector<2u>(reinterpret_cast<uint16_t *>(&uint32x4.y), &ptr[2]);
 	store_uint16_vector<2u>(reinterpret_cast<uint16_t *>(&uint32x4.z), &ptr[4]);
 	store_uint16_vector<2u>(reinterpret_cast<uint16_t *>(&uint32x4.w), &ptr[6]);
 	store_as<uint4>(dest, uint32x4);
 }
 template <>
 __device__ inline void store_uint16_vector<16u>(uint16_t *dest, const uint32_t *ptr)
 {
 	store_uint16_vector<8u>(dest + 0, ptr + 0);
 	store_uint16_vector<8u>(dest + 8, ptr + 8);
 }
 } // namespace sgm
 #endif // !__DEVICE_UTILITY_H__
@@ -0,0 +1,45 @@
 /*
 Copyright 2016 Fixstars Corporation
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
 #ifndef __HOST_UTILITY_H__
 #define __HOST_UTILITY_H__
 #include <cstdio>
 #include <stdexcept>
 // Prints a diagnostic to stdout when a CUDA runtime call did not return
 // cudaSuccess; execution continues either way.
 // `err` is evaluated exactly once, so it is safe to pass a call expression
 // such as CUDA_CHECK(cudaMalloc(...)) or CUDA_CHECK(cudaGetLastError()):
 // the call is not repeated on failure and the reported status is the one
 // that was actually returned.
 #define CUDA_CHECK(err) \
 do {\
 	const cudaError_t sgm_cuda_check_status_ = (err); \
 	if (sgm_cuda_check_status_ != cudaSuccess) { \
 		printf("[CUDA Error] %s (code: %d) at %s:%d\n", cudaGetErrorString(sgm_cuda_check_status_), \
 			static_cast<int>(sgm_cuda_check_status_), __FILE__, __LINE__); \
 	} \
 } while (0)
 // Throws std::logic_error(msg) when `expr` is false.
 // Wrapped in do { } while (0) so the macro behaves like a single statement
 // (safe inside an unbraced if/else), and the last line carries no
 // continuation backslash so the macro cannot absorb the line that follows.
 #define SGM_ASSERT(expr, msg) \
 do { \
 	if (!(expr)) { \
 		throw std::logic_error(msg); \
 	} \
 } while (0)
 namespace sgm
 {
 // Integer division rounded up: the number of `grain`-sized chunks needed to
 // cover `total` items (for non-negative total and positive grain).
 static inline int divUp(int total, int grain)
 {
 	const int rounded_up = total + grain - 1;
 	return rounded_up / grain;
 }
 } // namespace sgm
 #endif // !__HOST_UTILITY_H__
@@ -0,0 +1,48 @@
 /*
 Copyright 2016 Fixstars Corporation
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
 #ifndef __INTERNAL_H__
 #define __INTERNAL_H__
 #include "libsgm.h"
 #include "device_image.h"
 namespace sgm
 {
 namespace details
 {
 // Device-side stages of the stereo pipeline. StereoSGM::Impl::execute calls
 // them in this order: census_transform (left, right), cost_aggregation,
 // winner_takes_all, median_filter (left, right), check_consistency,
 // correct_disparity_range, and optionally cast_16bit_to_8bit.

 // Census transform of `src` into `dst`.
 void census_transform(const DeviceImage& src, DeviceImage& dst, CensusType type);
 // Aggregates matching costs of the two census images along 4 or 8 paths into
 // `dst`. disp_size must be 64, 128 or 256; other values are ignored.
 void cost_aggregation(const DeviceImage& srcL, const DeviceImage& srcR, DeviceImage& dst,
 	int disp_size, int P1, int P2, PathType path_type, int min_disp);
 // Selects disparities for the left and right views from the aggregated costs.
 void winner_takes_all(const DeviceImage& src, DeviceImage& dstL, DeviceImage& dstR,
 	int disp_size, float uniqueness, bool subpixel, PathType path_type);
 // Median filter applied to a disparity image.
 void median_filter(const DeviceImage& src, DeviceImage& dst);
 // Left/right consistency check; modifies dispL in place.
 void check_consistency(DeviceImage& dispL, const DeviceImage& dispR, const DeviceImage& srcL, bool subpixel, int LR_max_diff);
 // NOTE(review): presumably shifts disparities by min_disp (scaled when
 // subpixel is set) — confirm against the implementation.
 void correct_disparity_range(DeviceImage& disp, bool subpixel, int min_disp);
 // Element-wise conversions between 16-bit and 8-bit images; `dst` is
 // (re)created with the shape and step of `src`.
 void cast_16bit_to_8bit(const DeviceImage& src, DeviceImage& dst);
 void cast_8bit_to_16bit(const DeviceImage& src, DeviceImage& dst);
 } // namespace details
 } // namespace sgm
 #endif // !__INTERNAL_H__
@@ -0,0 +1,218 @@
 /*
 Copyright 2016 Fixstars Corporation
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
 #include <libsgm.h>
 #include <iostream>
 #include "internal.h"
 #include "host_utility.h"
 namespace sgm
 {
 // Returns true when an output of dst_depth bits can represent every value
 // the pipeline may emit for the given disparity range.
 //
 // The largest value is disparity_size + min_disp - 1 (scaled, plus the
 // largest fractional part, in subpixel mode). When min_disp <= 0 the output
 // must additionally fit a signed dst_depth-bit integer, down to the scaled
 // value min_disp - 1.
 static bool has_enough_depth(int dst_depth, int disparity_size, int min_disp, bool subpixel)
 {
 	const int64_t scale = subpixel ? static_cast<int64_t>(sgm::StereoSGM::SUBPIXEL_SCALE) : 1;
 	const int64_t largest = (static_cast<int64_t>(disparity_size) + min_disp - 1) * scale + (scale - 1);
 	// Unsigned range check.
 	const int64_t unsigned_limit = 1ll << dst_depth;
 	if (unsigned_limit <= largest)
 		return false;
 	if (min_disp > 0)
 		return true;
 	// Signed range check, only needed when non-positive outputs are possible.
 	const int64_t smallest = (static_cast<int64_t>(min_disp) - 1) * scale;
 	const int64_t signed_limit = 1ll << (dst_depth - 1);
 	return smallest >= -signed_limit && largest < signed_limit;
 }
 // Implementation object behind StereoSGM: owns the intermediate device
 // buffers and runs the whole pipeline in execute().
 class StereoSGM::Impl
 {
 public:
 	// Validates the configuration (throws std::logic_error via SGM_ASSERT) and
 	// pre-allocates the device buffers whose shape is known up front.
 	// src_pitch / dst_pitch are passed as the `step` of the source and output
 	// images, i.e. they count elements per row.
 	Impl(int width, int height, int disparity_size, int src_depth, int dst_depth, int src_pitch, int dst_pitch,
 		ExecuteInOut inout_type, const Parameters& param) :
 		width_(width),
 		height_(height),
 		disp_size_(disparity_size),
 		src_pitch_(src_pitch),
 		dst_pitch_(dst_pitch),
 		param_(param)
 	{
 		// check values
 		SGM_ASSERT(src_depth == 8 || src_depth == 16 || src_depth == 32, "src depth bits must be 8, 16 or 32");
 		SGM_ASSERT(dst_depth == 8 || dst_depth == 16, "dst depth bits must be 8 or 16");
 		SGM_ASSERT(disparity_size == 64 || disparity_size == 128 || disparity_size == 256, "disparity size must be 64 or 128 or 256");
 		SGM_ASSERT(has_enough_depth(dst_depth, disparity_size, param_.min_disp, param_.subpixel),
 			"output depth bits must be sufficient for representing output value");
 		src_type_ = src_depth == 8 ? SGM_8U : src_depth == 16 ? SGM_16U : SGM_32U;
 		dst_type_ = dst_depth == 8 ? SGM_8U : SGM_16U;
 		// Bit 0 of inout_type: inputs are device pointers; bit 1: output is a
 		// device pointer.
 		is_src_devptr_ = (inout_type & 0x01) > 0;
 		is_dst_devptr_ = (inout_type & 0x02) > 0;
 		// Host inputs need staging buffers; device inputs are wrapped in execute().
 		if (!is_src_devptr_) {
 			d_srcL_.create(height, width, src_type_, src_pitch);
 			d_srcR_.create(height, width, src_type_, src_pitch);
 		}
 		// The 9x7 census uses 64-bit descriptors, every other type 32-bit ones.
 		const ImageType census_type = param.census_type == CensusType::CENSUS_9x7 ? SGM_64U : SGM_32U;
 		d_censusL_.create(height, width, census_type);
 		d_censusR_.create(height, width, census_type);
 		d_censusL_.fill_zero();
 		d_censusR_.fill_zero();
 		d_tmpL_.create(height, width, SGM_16U, dst_pitch);
 		d_tmpR_.create(height, width, SGM_16U, dst_pitch);
 		// With a 16-bit device output the caller's buffer is used directly as
 		// d_dispL_ (see execute()), so no allocation is needed here.
 		if (!(is_dst_devptr_ && dst_type_ == SGM_16U)) {
 			d_dispL_.create(height, width, SGM_16U, dst_pitch);
 		}
 		d_dispR_.create(height, width, SGM_16U, dst_pitch);
 	}
 	// Runs the full pipeline on one stereo pair and writes the left disparity
 	// map to `dst`. srcL / srcR / dst are host or device pointers according to
 	// the inout_type given at construction.
 	void execute(const void* srcL, const void* srcR, void* dst)
 	{
 		if (is_src_devptr_) {
 			// Wrap the caller's device buffers without copying.
 			d_srcL_.create((void*)srcL, height_, width_, src_type_, src_pitch_);
 			d_srcR_.create((void*)srcR, height_, width_, src_type_, src_pitch_);
 		}
 		else {
 			d_srcL_.upload(srcL);
 			d_srcR_.upload(srcR);
 		}
 		if (is_dst_devptr_ && dst_type_ == SGM_16U) {
 			// when there is no device-host copy or type conversion, use passed buffer
 			d_dispL_.create((void*)dst, height_, width_, SGM_16U, dst_pitch_);
 		}
 		// census transform
 		details::census_transform(d_srcL_, d_censusL_, param_.census_type);
 		details::census_transform(d_srcR_, d_censusR_, param_.census_type);
 		// cost aggregation
 		details::cost_aggregation(d_censusL_, d_censusR_, d_cost_, disp_size_,
 			param_.P1, param_.P2, param_.path_type, param_.min_disp);
 		// winner-takes-all
 		details::winner_takes_all(d_cost_, d_tmpL_, d_tmpR_, disp_size_,
 			param_.uniqueness, param_.subpixel, param_.path_type);
 		// post filtering
 		details::median_filter(d_tmpL_, d_dispL_);
 		details::median_filter(d_tmpR_, d_dispR_);
 		// consistency check
 		details::check_consistency(d_dispL_, d_dispR_, d_srcL_, param_.subpixel, param_.LR_max_diff);
 		details::correct_disparity_range(d_dispL_, param_.subpixel, param_.min_disp);
 		// Deliver the result: convert to 8 bits and/or copy to the host as needed.
 		if (!is_dst_devptr_ && dst_type_ == SGM_8U) {
 			// d_tmpL_ is reused as the 8-bit staging buffer for the download.
 			details::cast_16bit_to_8bit(d_dispL_, d_tmpL_);
 			d_tmpL_.download(dst);
 		}
 		else if (is_dst_devptr_ && dst_type_ == SGM_8U) {
 			DeviceImage d_dst(dst, height_, width_, SGM_8U, dst_pitch_);
 			details::cast_16bit_to_8bit(d_dispL_, d_dst);
 		}
 		else if (!is_dst_devptr_ && dst_type_ == SGM_16U) {
 			d_dispL_.download(dst);
 		}
 		else if (is_dst_devptr_ && dst_type_ == SGM_16U) {
 			// optimize! no-copy!
 		}
 		else {
 			// Not reachable: the constructor only ever sets dst_type_ to SGM_8U
 			// or SGM_16U, and both are handled above.
 			std::cerr << "not impl" << std::endl;
 		}
 	}
 	// Value that marks an invalid disparity in the output: min_disp - 1,
 	// multiplied by SUBPIXEL_SCALE in subpixel mode.
 	int get_invalid_disparity() const
 	{
 		return (param_.min_disp - 1) * (param_.subpixel ? SUBPIXEL_SCALE : 1);
 	}
 private:
 	int width_;
 	int height_;
 	int disp_size_;
 	int src_pitch_;
 	int dst_pitch_;
 	Parameters param_;
 	ImageType src_type_;
 	ImageType dst_type_;
 	bool is_src_devptr_;
 	bool is_dst_devptr_;
 	// Input images (staging copies, or wrappers around device pointers).
 	DeviceImage d_srcL_;
 	DeviceImage d_srcR_;
 	// Census descriptors of both views.
 	DeviceImage d_censusL_;
 	DeviceImage d_censusR_;
 	// Aggregated cost volumes, one row per path.
 	DeviceImage d_cost_;
 	// Winner-takes-all output, before median filtering.
 	DeviceImage d_tmpL_;
 	DeviceImage d_tmpR_;
 	// Filtered disparity maps.
 	DeviceImage d_dispL_;
 	DeviceImage d_dispR_;
 };
 // Stores every tuning parameter unchanged in the member of the same name.
 StereoSGM::Parameters::Parameters(int P1, int P2, float uniqueness, bool subpixel, PathType path_type,
 	int min_disp, int LR_max_diff, CensusType census_type)
 	: P1(P1)
 	, P2(P2)
 	, uniqueness(uniqueness)
 	, subpixel(subpixel)
 	, path_type(path_type)
 	, min_disp(min_disp)
 	, LR_max_diff(LR_max_diff)
 	, census_type(census_type)
 {
 }
 // Convenience constructor for tightly packed images: `width` is used as both
 // the source and the destination pitch.
 // Throws std::logic_error when Impl rejects the configuration.
 StereoSGM::StereoSGM(int width, int height, int disparity_size, int src_depth, int dst_depth,
 	ExecuteInOut inout_type, const Parameters& param)
 	: impl_(new Impl(width, height, disparity_size, src_depth, dst_depth, width, width, inout_type, param))
 {
 }
 // Constructor with explicit source and destination pitches.
 // Throws std::logic_error when Impl rejects the configuration.
 StereoSGM::StereoSGM(int width, int height, int disparity_size, int src_depth, int dst_depth, int src_pitch, int dst_pitch,
 	ExecuteInOut inout_type, const Parameters& param)
 	: impl_(new Impl(width, height, disparity_size, src_depth, dst_depth, src_pitch, dst_pitch, inout_type, param))
 {
 }
 // Destroys the implementation object allocated with `new` in the constructors.
 StereoSGM::~StereoSGM()
 {
 	delete impl_;
 }
 // Computes the disparity map for one stereo pair; forwards to Impl::execute.
 // Whether srcL / srcR / dst are host or device pointers is fixed by the
 // inout_type passed at construction.
 void StereoSGM::execute(const void* srcL, const void* srcR, void* dst)
 {
 	impl_->execute(srcL, srcR, dst);
 }
 // Returns the value that marks invalid pixels in the output disparity map;
 // forwards to Impl::get_invalid_disparity.
 int StereoSGM::get_invalid_disparity() const
 {
 	return impl_->get_invalid_disparity();
 }
 } // namespace sgm
@@ -0,0 +1,145 @@
 /*
 Copyright 2016 Fixstars Corporation
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
 #include <libsgm_wrapper.h>
 namespace sgm
 {
 // Stores the disparity count and bundles the remaining arguments into the
 // StereoSGM parameter set. No matcher is created here: sgm_ and prev_ both
 // start out null.
 LibSGMWrapper::LibSGMWrapper(int numDisparity, int P1, int P2, float uniquenessRatio, bool subpixel, PathType pathType, int minDisparity, int lrMaxDiff, CensusType censusType)
 	: sgm_(nullptr)
 	, numDisparity_(numDisparity)
 	, param_(P1, P2, uniquenessRatio, subpixel, pathType, minDisparity, lrMaxDiff, censusType)
 	, prev_(nullptr)
 {
 }
 // Out-of-line defaulted destructor.
 // NOTE(review): Creator is only defined further down in this file; if prev_ is
 // a std::unique_ptr<Creator> (header not visible here), confirm its deleter is
 // instantiated at a point where Creator is a complete type.
 LibSGMWrapper::~LibSGMWrapper() = default;
 // Read-only accessors for the configuration captured by the constructor.
 int LibSGMWrapper::getNumDisparities() const
 {
 	return numDisparity_;
 }
 float LibSGMWrapper::getUniquenessRatio() const
 {
 	return param_.uniqueness;
 }
 int LibSGMWrapper::getP1() const
 {
 	return param_.P1;
 }
 int LibSGMWrapper::getP2() const
 {
 	return param_.P2;
 }
 bool LibSGMWrapper::hasSubpixel() const
 {
 	return param_.subpixel;
 }
 PathType LibSGMWrapper::getPathType() const
 {
 	return param_.path_type;
 }
 int LibSGMWrapper::getMinDisparity() const
 {
 	return param_.min_disp;
 }
 int LibSGMWrapper::getLrMaxDiff() const
 {
 	return param_.LR_max_diff;
 }
 CensusType LibSGMWrapper::getCensusType() const
 {
 	return param_.census_type;
 }
 // Returns the invalid-disparity value: one below the minimum disparity,
 // multiplied by StereoSGM::SUBPIXEL_SCALE when sub-pixel output is enabled.
 int LibSGMWrapper::getInvalidDisparity() const
 {
 	const int belowMin = param_.min_disp - 1;
 	return param_.subpixel ? belowMin * StereoSGM::SUBPIXEL_SCALE : belowMin;
 }
 // Snapshot of the image geometry, bit depths and memory location that a
 // StereoSGM instance is constructed for. execute() compares the snapshot of
 // the current call with the previous one to decide whether the matcher must
 // be rebuilt.
 struct LibSGMWrapper::Creator
 {
 	int width;
 	int height;
 	int src_pitch;
 	int dst_pitch;
 	int input_depth_bits;
 	int output_depth_bits;
 	sgm::ExecuteInOut inout_type;
 	// Field-by-field equality over all seven members.
 	bool operator==(const Creator& rhs) const
 	{
 		if (width != rhs.width || height != rhs.height)
 			return false;
 		if (src_pitch != rhs.src_pitch || dst_pitch != rhs.dst_pitch)
 			return false;
 		if (input_depth_bits != rhs.input_depth_bits || output_depth_bits != rhs.output_depth_bits)
 			return false;
 		return inout_type == rhs.inout_type;
 	}
 	bool operator!=(const Creator& rhs) const
 	{
 		return !operator==(rhs);
 	}
 	// Allocates a StereoSGM for the recorded configuration. The caller takes
 	// ownership of the returned pointer.
 	StereoSGM* createStereoSGM(int disparity_size, const StereoSGM::Parameters& param)
 	{
 		StereoSGM* const matcher = new StereoSGM(width, height, disparity_size,
 			input_depth_bits, output_depth_bits, src_pitch, dst_pitch, inout_type, param);
 		return matcher;
 	}
 #ifdef BUILD_OPENCV_WRAPPER
 	// Describes a device-memory (GpuMat) source/destination pair. Pitches are
 	// taken from step1() and bit depths from elemSize1() * 8.
 	Creator(const cv::cuda::GpuMat& src, const cv::cuda::GpuMat& dst)
 		: width(src.cols)
 		, height(src.rows)
 		, src_pitch(static_cast<int>(src.step1()))
 		, dst_pitch(static_cast<int>(dst.step1()))
 		, input_depth_bits(static_cast<int>(src.elemSize1()) * 8)
 		, output_depth_bits(static_cast<int>(dst.elemSize1()) * 8)
 		, inout_type(sgm::EXECUTE_INOUT_CUDA2CUDA)
 	{
 		const int depth = src.depth();
 		CV_Assert(depth == CV_8U || depth == CV_16U || depth == CV_32S);
 	}
 	// Describes a host-memory (cv::Mat) source/destination pair. Pitches are
 	// taken from step1() and bit depths from elemSize1() * 8.
 	Creator(const cv::Mat& src, const cv::Mat& dst)
 		: width(src.cols)
 		, height(src.rows)
 		, src_pitch(static_cast<int>(src.step1()))
 		, dst_pitch(static_cast<int>(dst.step1()))
 		, input_depth_bits(static_cast<int>(src.elemSize1()) * 8)
 		, output_depth_bits(static_cast<int>(dst.elemSize1()) * 8)
 		, inout_type(sgm::EXECUTE_INOUT_HOST2HOST)
 	{
 		const int depth = src.depth();
 		CV_Assert(depth == CV_8U || depth == CV_16U || depth == CV_32S);
 	}
 #endif // BUILD_OPENCV_WRAPPER
 };
 #ifdef BUILD_OPENCV_WRAPPER
 // Computes a disparity map for an image pair held in GPU memory.
 //
 // I1, I2     Input images, passed to StereoSGM::execute as srcL and srcR.
 //            They must have the same size, type and row stride, be
 //            single-channel, and have depth CV_8U, CV_16U or CV_32S.
 // disparity  Output image. Re-created as CV_16S with the input size whenever
 //            its current size or depth differs.
 //
 // Throws cv::Exception (through CV_Assert) when an input requirement is not
 // met. The underlying StereoSGM is rebuilt only when the geometry, depths or
 // pitches differ from the previous call.
 void LibSGMWrapper::execute(const cv::cuda::GpuMat& I1, const cv::cuda::GpuMat& I2, cv::cuda::GpuMat& disparity)
 {
 	const cv::Size size = I1.size();
 	CV_Assert(size == I2.size());
 	CV_Assert(I1.type() == I2.type());
 	const int depth = I1.depth();
 	CV_Assert(depth == CV_8U || depth == CV_16U || depth == CV_32S);
 	// The matcher is configured with the column count as its width and has no
 	// notion of channels, so interleaved multi-channel data would be misread.
 	CV_Assert(I1.channels() == 1);
 	// Only one source pitch (taken from I1) is handed to the matcher, and it
 	// is applied to both inputs; a differently strided I2 would be read
 	// with the wrong row offsets.
 	CV_Assert(I1.step1() == I2.step1());
 	if (disparity.size() != size || disparity.depth() != CV_16S) {
 		disparity.create(size, CV_16S);
 	}
 	std::unique_ptr<Creator> creator(new Creator(I1, disparity));
 	if (!sgm_ || !prev_ || *creator != *prev_) {
 		sgm_.reset(creator->createStereoSGM(numDisparity_, param_));
 	}
 	prev_ = std::move(creator);
 	sgm_->execute(I1.data, I2.data, disparity.data);
 }
 // Computes a disparity map for an image pair held in host memory.
 //
 // I1, I2     Input images, passed to StereoSGM::execute as srcL and srcR.
 //            They must have the same size, type and row stride, be
 //            single-channel, and have depth CV_8U, CV_16U or CV_32S.
 // disparity  Output image. Re-created as CV_16S with the input size whenever
 //            its current size or depth differs.
 //
 // Throws cv::Exception (through CV_Assert) when an input requirement is not
 // met. The underlying StereoSGM is rebuilt only when the geometry, depths or
 // pitches differ from the previous call.
 void LibSGMWrapper::execute(const cv::Mat& I1, const cv::Mat& I2, cv::Mat& disparity)
 {
 	const cv::Size size = I1.size();
 	CV_Assert(size == I2.size());
 	CV_Assert(I1.type() == I2.type());
 	const int depth = I1.depth();
 	CV_Assert(depth == CV_8U || depth == CV_16U || depth == CV_32S);
 	// The matcher is configured with the column count as its width and has no
 	// notion of channels, so interleaved multi-channel data would be misread.
 	CV_Assert(I1.channels() == 1);
 	// Only one source pitch (taken from I1) is handed to the matcher, and it
 	// is applied to both inputs; a differently strided I2 (for example a
 	// ROI view of a wider image) would be read with the wrong row offsets.
 	CV_Assert(I1.step1() == I2.step1());
 	if (disparity.size() != size || disparity.depth() != CV_16S) {
 		disparity.create(size, CV_16S);
 	}
 	std::unique_ptr<Creator> creator(new Creator(I1, disparity));
 	if (!sgm_ || !prev_ || *creator != *prev_) {
 		sgm_.reset(creator->createStereoSGM(numDisparity_, param_));
 	}
 	prev_ = std::move(creator);
 	sgm_->execute(I1.data, I2.data, disparity.data);
 }
 #endif // BUILD_OPENCV_WRAPPER
 } // namespace sgm
@@ -0,0 +1,295 @@
 /*
 Copyright 2016 Fixstars Corporation
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
 #include "internal.h"
 #include <cuda_runtime.h>
 #include "host_utility.h"
 namespace
 {
 // Thread-block dimensions used for the median kernel launches.
 constexpr int BLOCK_X = 16;
 constexpr int BLOCK_Y = 16;
 // Median window geometry: a KSIZE x KSIZE window reaching RADIUS pixels to
 // each side of the centre, KSIZE_SQ samples in total.
 constexpr int KSIZE = 3;
 constexpr int RADIUS = KSIZE / 2;
 constexpr int KSIZE_SQ = KSIZE * KSIZE;
 // Device-side exchange of two values through a temporary copy.
 template <typename T>
 __device__ inline void swap(T& x, T& y)
 {
 	const T held = x;
 	x = y;
 	y = held;
 }
 // Scalar primitives (one value per operand). The V parameter is unused here;
 // it only selects the packed specialisations when V is 2 or 4.
 //   dev_sort: orders the pair so that x <= y
 //   dev_min : x receives min(x, y), y is left untouched
 //   dev_max : y receives max(x, y), x is left untouched
 template <typename T, int V = 1>
 __device__ inline void dev_sort(T& x, T& y)
 {
 	if (x > y)
 		swap(x, y);
 }
 template <typename T, int V = 1>
 __device__ inline void dev_min(T& x, T& y)
 {
 	x = min(x, y);
 }
 template <typename T, int V = 1>
 __device__ inline void dev_max(T& x, T& y)
 {
 	y = max(x, y);
 }
 // Packed primitives for two unsigned 16-bit lanes per uint32_t operand.
 __device__ inline void dev_sort_2(uint32_t& x, uint32_t& y)
 {
 	// Lanes where x > y get an all-ones mask; XOR-ing both operands with the
 	// masked difference exchanges exactly those lanes.
 	const uint32_t swap_lanes = __vcmpgtu2(x, y);
 	const uint32_t delta = (x ^ y) & swap_lanes;
 	x = x ^ delta;
 	y = y ^ delta;
 }
 __device__ inline void dev_min_2(uint32_t& x, uint32_t& y)
 {
 	x = __vminu2(x, y);
 }
 __device__ inline void dev_max_2(uint32_t& x, uint32_t& y)
 {
 	y = __vmaxu2(x, y);
 }
 // Route the V == 2 instantiations of the generic helpers to the packed versions.
 template <>
 __device__ inline void dev_sort<uint32_t, 2>(uint32_t& x, uint32_t& y)
 {
 	dev_sort_2(x, y);
 }
 template <>
 __device__ inline void dev_min<uint32_t, 2>(uint32_t& x, uint32_t& y)
 {
 	dev_min_2(x, y);
 }
 template <>
 __device__ inline void dev_max<uint32_t, 2>(uint32_t& x, uint32_t& y)
 {
 	dev_max_2(x, y);
 }
 // Packed primitives for four unsigned 8-bit lanes per uint32_t operand.
 __device__ inline void dev_sort_4(uint32_t& x, uint32_t& y)
 {
 	// Lanes where x > y get an all-ones mask; XOR-ing both operands with the
 	// masked difference exchanges exactly those lanes.
 	const uint32_t swap_lanes = __vcmpgtu4(x, y);
 	const uint32_t delta = (x ^ y) & swap_lanes;
 	x = x ^ delta;
 	y = y ^ delta;
 }
 __device__ inline void dev_min_4(uint32_t& x, uint32_t& y)
 {
 	x = __vminu4(x, y);
 }
 __device__ inline void dev_max_4(uint32_t& x, uint32_t& y)
 {
 	y = __vmaxu4(x, y);
 }
 // Route the V == 4 instantiations of the generic helpers to the packed versions.
 template <>
 __device__ inline void dev_sort<uint32_t, 4>(uint32_t& x, uint32_t& y)
 {
 	dev_sort_4(x, y);
 }
 template <>
 __device__ inline void dev_min<uint32_t, 4>(uint32_t& x, uint32_t& y)
 {
 	dev_min_4(x, y);
 }
 template <>
 __device__ inline void dev_max<uint32_t, 4>(uint32_t& x, uint32_t& y)
 {
 	dev_max_4(x, y);
 }
 // Nine-input median selection network built from compare-exchange (SWAP_OP),
 // keep-minimum (MIN_OP) and keep-maximum (MAX_OP) steps. On return buf[4]
 // holds the median of the nine inputs; the remaining slots contain
 // intermediate values and should not be relied on. buf is modified in place.
 // For V == 2 or V == 4 the packed dev_* specialisations are selected, so the
 // same network runs independently on every lane of the uint32_t operands.
 template <typename T, int V = 1>
 __device__ inline void median_selection_network_9(T* buf)
 {
 #define SWAP_OP(i, j) dev_sort<T, V>(buf[i], buf[j])
 #define MIN_OP(i, j) dev_min<T, V>(buf[i], buf[j])
 #define MAX_OP(i, j) dev_max<T, V>(buf[i], buf[j])
 	// Sort each triple (0,1,2), (3,4,5), (6,7,8) into ascending order.
 	SWAP_OP(0, 1); SWAP_OP(3, 4); SWAP_OP(6, 7);
 	SWAP_OP(1, 2); SWAP_OP(4, 5); SWAP_OP(7, 8);
 	SWAP_OP(0, 1); SWAP_OP(3, 4); SWAP_OP(6, 7);
 	// buf[6] <- largest of the three triple minima.
 	MAX_OP(0, 3); MAX_OP(3, 6);
 	// buf[4] <- median of the three triple middles.
 	SWAP_OP(1, 4); MIN_OP(4, 7); MAX_OP(1, 4);
 	// buf[2] <- smallest of the three triple maxima.
 	MIN_OP(5, 8); MIN_OP(2, 5);
 	// buf[4] <- median of buf[2], buf[4], buf[6], which is the overall median.
 	SWAP_OP(2, 4); MIN_OP(4, 6); MAX_OP(2, 4);
 #undef SWAP_OP
 #undef MIN_OP
 #undef MAX_OP
 }
 // Runs the selection network over the KSIZE_SQ samples in buf (modifying
 // them in place) and returns the value left in the middle slot.
 template <typename T, int V = 1>
 __device__ inline T median(T* buf)
 {
 	const int middle = KSIZE_SQ / 2;
 	median_selection_network_9<T, V>(buf);
 	return buf[middle];
 }
 // Scalar 3x3 median for 8-bit images, one thread per output pixel.
 // p is the row stride of src and dst in elements. Pixels closer than RADIUS
 // to any image edge are written as 0.
 __global__ void median_kernel_3x3_8u(const uint8_t* src, uint8_t* dst, int w, int h, int p)
 {
 	const int col = blockIdx.x * blockDim.x + threadIdx.x;
 	const int row = blockIdx.y * blockDim.y + threadIdx.y;
 	if (col >= w || row >= h)
 		return;
 	const bool on_border = col < RADIUS || col >= w - RADIUS || row < RADIUS || row >= h - RADIUS;
 	if (on_border) {
 		dst[row * p + col] = 0;
 		return;
 	}
 	// Gather the window row by row, top-left sample first.
 	uint8_t window[KSIZE_SQ];
 	for (int dy = 0; dy < KSIZE; dy++) {
 		for (int dx = 0; dx < KSIZE; dx++)
 			window[dy * KSIZE + dx] = src[(row - RADIUS + dy) * p + (col - RADIUS + dx)];
 	}
 	dst[row * p + col] = median(window);
 }
 // Scalar 3x3 median for 16-bit images, one thread per output pixel.
 // p is the row stride of src and dst in elements. Pixels closer than RADIUS
 // to any image edge are written as 0.
 __global__ void median_kernel_3x3_16u(const uint16_t* src, uint16_t* dst, int w, int h, int p)
 {
 	const int col = blockIdx.x * blockDim.x + threadIdx.x;
 	const int row = blockIdx.y * blockDim.y + threadIdx.y;
 	if (col >= w || row >= h)
 		return;
 	const bool on_border = col < RADIUS || col >= w - RADIUS || row < RADIUS || row >= h - RADIUS;
 	if (on_border) {
 		dst[row * p + col] = 0;
 		return;
 	}
 	// Gather the window row by row, top-left sample first.
 	uint16_t window[KSIZE_SQ];
 	for (int dy = 0; dy < KSIZE; dy++) {
 		for (int dx = 0; dx < KSIZE; dx++)
 			window[dy * KSIZE + dx] = src[(row - RADIUS + dy) * p + (col - RADIUS + dx)];
 	}
 	dst[row * p + col] = median(window);
 }
 // Packed 3x3 median for 8-bit images: each thread produces the four
 // horizontally adjacent output pixels starting at column x_4. pitch is the row
 // stride of src and dst in elements. Rows and columns closer than RADIUS to
 // the image edge are written as 0.
 // The fast path reads and writes whole 32-bit words through casted pointers,
 // so it relies on 4-byte aligned rows; the host launcher in this file only
 // selects this kernel when pitch % 4 == 0.
 __global__ void median_kernel_3x3_8u_v4(const uint8_t* src, uint8_t* dst, int w, int h, int pitch)
 {
 	const int x_4 = 4 * (blockIdx.x * blockDim.x + threadIdx.x);
 	const int y = blockIdx.y * blockDim.y + threadIdx.y;
 	if (y >= h)
 		return;
 	// Top and bottom border rows have no full window: zero this thread's pixels.
 	if (y < RADIUS || y >= h - RADIUS) {
 		for (int x = x_4; x < min(x_4 + 4, w); x++)
 			dst[y * pitch + x] = 0;
 		return;
 	}
 	uint32_t buf[KSIZE_SQ];
 	// Fast path: the words to the left and right of this thread's word both
 	// lie fully inside the row, so all nine 32-bit loads stay in bounds.
 	if (x_4 >= 4 && x_4 + 7 < w)
 	{
 		// For each of the three rows load the previous, current and next word.
 		buf[0] = *((const uint32_t*)&src[(y - 1) * pitch + x_4 - 4]);
 		buf[1] = *((const uint32_t*)&src[(y - 1) * pitch + x_4 - 0]);
 		buf[2] = *((const uint32_t*)&src[(y - 1) * pitch + x_4 + 4]);
 		buf[3] = *((const uint32_t*)&src[(y - 0) * pitch + x_4 - 4]);
 		buf[4] = *((const uint32_t*)&src[(y - 0) * pitch + x_4 - 0]);
 		buf[5] = *((const uint32_t*)&src[(y - 0) * pitch + x_4 + 4]);
 		buf[6] = *((const uint32_t*)&src[(y + 1) * pitch + x_4 - 4]);
 		buf[7] = *((const uint32_t*)&src[(y + 1) * pitch + x_4 - 0]);
 		buf[8] = *((const uint32_t*)&src[(y + 1) * pitch + x_4 + 4]);
 		// Turn the outer words into the current word shifted by one pixel:
 		// slots 0/3/6 become pixels x_4-1 .. x_4+2 (left neighbours) and
 		// slots 2/5/8 become pixels x_4+1 .. x_4+4 (right neighbours), with
 		// the leftmost pixel in the least-significant byte.
 		buf[0] = (buf[1] << 8) | (buf[0] >> 24);
 		buf[2] = (buf[1] >> 8) | (buf[2] << 24);
 		buf[3] = (buf[4] << 8) | (buf[3] >> 24);
 		buf[5] = (buf[4] >> 8) | (buf[5] << 24);
 		buf[6] = (buf[7] << 8) | (buf[6] >> 24);
 		buf[8] = (buf[7] >> 8) | (buf[8] << 24);
 		// Byte lane k of the nine words now holds the 3x3 window of pixel
 		// x_4 + k; the packed median yields all four results in one word.
 		*((uint32_t*)&dst[y * pitch + x_4]) = median<uint32_t, 4>(buf);
 	}
 	// Slow path near the left/right edges: handle each pixel on its own.
 	else if (x_4 < w) {
 		for (int x = x_4; x < min(x_4 + 4, w); x++) {
 			if (x >= RADIUS && x < w - RADIUS) {
 				// Reuse the word buffer's storage as a nine-byte scalar window.
 				uint8_t* buf_u8 = (uint8_t*)buf;
 				for (int i = 0; i < KSIZE_SQ; i++)
 					buf_u8[i] = src[(y - RADIUS + i / KSIZE) * pitch + (x - RADIUS + i % KSIZE)];
 				dst[y * pitch + x] = median(buf_u8);
 			}
 			else {
 				dst[y * pitch + x] = 0;
 			}
 		}
 	}
 }
 // Packed 3x3 median for 16-bit images: each thread produces the two
 // horizontally adjacent output pixels starting at column x_2. pitch is the row
 // stride of src and dst in elements. Rows and columns closer than RADIUS to
 // the image edge are written as 0.
 // The fast path reads and writes whole 32-bit words through casted pointers,
 // so it relies on 4-byte aligned rows; the host launcher in this file only
 // selects this kernel when pitch % 2 == 0.
 __global__ void median_kernel_3x3_16u_v2(const uint16_t* src, uint16_t* dst, int w, int h, int pitch)
 {
 	const int x_2 = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
 	const int y = blockIdx.y * blockDim.y + threadIdx.y;
 	if (y >= h)
 		return;
 	// Top and bottom border rows have no full window: zero this thread's pixels.
 	if (y < RADIUS || y >= h - RADIUS) {
 		for (int x = x_2; x < min(x_2 + 2, w); x++)
 			dst[y * pitch + x] = 0;
 		return;
 	}
 	uint32_t buf[KSIZE_SQ];
 	// Fast path: the words to the left and right of this thread's word both
 	// lie fully inside the row, so all nine 32-bit loads stay in bounds.
 	if (x_2 >= 2 && x_2 + 3 < w)
 	{
 		// For each of the three rows load the previous, current and next word.
 		buf[0] = *((const uint32_t*)&src[(y - 1) * pitch + x_2 - 2]);
 		buf[1] = *((const uint32_t*)&src[(y - 1) * pitch + x_2 - 0]);
 		buf[2] = *((const uint32_t*)&src[(y - 1) * pitch + x_2 + 2]);
 		buf[3] = *((const uint32_t*)&src[(y - 0) * pitch + x_2 - 2]);
 		buf[4] = *((const uint32_t*)&src[(y - 0) * pitch + x_2 - 0]);
 		buf[5] = *((const uint32_t*)&src[(y - 0) * pitch + x_2 + 2]);
 		buf[6] = *((const uint32_t*)&src[(y + 1) * pitch + x_2 - 2]);
 		buf[7] = *((const uint32_t*)&src[(y + 1) * pitch + x_2 - 0]);
 		buf[8] = *((const uint32_t*)&src[(y + 1) * pitch + x_2 + 2]);
 		// Turn the outer words into the current word shifted by one pixel:
 		// slots 0/3/6 become pixels x_2-1, x_2 (left neighbours) and
 		// slots 2/5/8 become pixels x_2+1, x_2+2 (right neighbours), with
 		// the leftmost pixel in the least-significant half-word.
 		buf[0] = (buf[1] << 16) | (buf[0] >> 16);
 		buf[2] = (buf[1] >> 16) | (buf[2] << 16);
 		buf[3] = (buf[4] << 16) | (buf[3] >> 16);
 		buf[5] = (buf[4] >> 16) | (buf[5] << 16);
 		buf[6] = (buf[7] << 16) | (buf[6] >> 16);
 		buf[8] = (buf[7] >> 16) | (buf[8] << 16);
 		// Half-word lane k of the nine words now holds the 3x3 window of pixel
 		// x_2 + k; the packed median yields both results in one word.
 		*((uint32_t*)&dst[y * pitch + x_2]) = median<uint32_t, 2>(buf);
 	}
 	// Slow path near the left/right edges: handle each pixel on its own.
 	else if (x_2 < w) {
 		for (int x = x_2; x < min(x_2 + 2, w); x++) {
 			if (x >= RADIUS && x < w - RADIUS) {
 				// Reuse the word buffer's storage as a nine-element scalar window.
 				uint16_t* buf_u16 = (uint16_t*)buf;
 				for (int i = 0; i < KSIZE_SQ; i++)
 					buf_u16[i] = src[(y - RADIUS + i / KSIZE) * pitch + (x - RADIUS + i % KSIZE)];
 				dst[y * pitch + x] = median(buf_u16);
 			}
 			else {
 				dst[y * pitch + x] = 0;
 			}
 		}
 	}
 }
 } // namespace
 namespace sgm
 {
 namespace details
 {
 void median_filter(const DeviceImage& src, DeviceImage& dst)
 {
 	const int w = src.cols;
 	const int h = src.rows;
 	const int pitch = src.step;
 	dst.create(h, w, src.type, src.step);
 	const dim3 block(BLOCK_X, BLOCK_Y);
 	if (src.type == SGM_8U) {
 		using T = uint8_t;
 		if (pitch % 4 == 0) {
 			const dim3 grid(divUp(divUp(w, 4), block.x), divUp(h, block.y));
 			median_kernel_3x3_8u_v4<<<grid, block>>>(src.ptr<T>(), dst.ptr<T>(), w, h, pitch);
 		}
 		else {
 			const dim3 grid(divUp(w, block.x), divUp(h, block.y));
 			median_kernel_3x3_8u<<<grid, block>>>(src.ptr<T>(), dst.ptr<T>(), w, h, pitch);
 		}
 	}
 	else if (src.type == SGM_16U) {
 		using T = uint16_t;
 		if (pitch % 2 == 0) {
 			const dim3 grid(divUp(divUp(w, 2), block.x), divUp(h, block.y));
 			median_kernel_3x3_16u_v2<<<grid, block>>>(src.ptr<T>(), dst.ptr<T>(), w, h, pitch);
 		}
 		else {
 			const dim3 grid(divUp(w, block.x), divUp(h, block.y));
 			median_kernel_3x3_16u<<<grid, block>>>(src.ptr<T>(), dst.ptr<T>(), w, h, pitch);
 		}
 	}
 	CUDA_CHECK(cudaGetLastError());
 }
 } // namespace details
 } // namespace sgm
--- a/Show More
+++ b/Show More