Initial commit: Speckle-Scanner 3D pipeline with setup README

2026-06-10 03:09:05 +05:00
commit 1765934846
375 changed files with 123081 additions and 0 deletions
@@ -0,0 +1,2 @@
+include/libsgm_config.h
+build/
@@ -0,0 +1,66 @@
+image: adaskit/libsgm:0.3-opencv4
+
+variables:
+  GIT_SUBMODULE_STRATEGY: recursive  # check out submodules (test/googletest) for every job
+
+stages:
+  - build
+  - test
+# Shared job body: each build:* job sets build_samples / build_shared / build_tests and merges this in.
+.build_template: &build_definition
+  stage: build
+  tags:
+    - docker
+  script:
+    - ldconfig
+    - cmake . -DBUILD_OPENCV_WRAPPER="ON" -DENABLE_SAMPLES=${build_samples} -DLIBSGM_SHARED=${build_shared} -DENABLE_TESTS=${build_tests}
+    - make
+
+build:samples_on:shared:
+  variables:
+    build_samples: "ON"
+    build_shared: "ON"
+    build_tests: "OFF"
+  <<: *build_definition
+
+build:samples_on:static:
+  variables:
+    build_samples: "ON"
+    build_shared: "OFF"
+    build_tests: "OFF"
+  <<: *build_definition
+
+build:samples_off:shared:
+  variables:
+    build_samples: "OFF"
+    build_shared: "ON"
+    build_tests: "OFF"
+  <<: *build_definition
+
+build:samples_off:static:
+  variables:
+    build_samples: "OFF"
+    build_shared: "OFF"
+    build_tests: "OFF"
+  <<: *build_definition
+# Static build with tests enabled; the resulting test binary is kept as an artifact for the test stage.
+build:test:
+  variables:
+    build_samples: "OFF"
+    build_shared: "OFF"
+    build_tests: "ON"
+  artifacts:
+    paths:
+      - ./test/sgm-test
+    expire_in: 1d
+  <<: *build_definition
+# Runs the test binary produced by build:test under cuda-memcheck with full leak checking.
+test:
+  stage: test
+  tags:
+    - nvidia-docker  # NOTE(review): presumably selects a GPU-capable runner - confirm
+  script:
+    - ldconfig
+    - cuda-memcheck --leak-check full ./test/sgm-test
+  dependencies:
+    - build:test
@@ -0,0 +1,3 @@
+[submodule "test/googletest"]
+	path = test/googletest
+	url = https://github.com/google/googletest.git
@@ -0,0 +1,28 @@
+cmake_minimum_required(VERSION 3.18)
+# User-facing build switches; every optional component is OFF unless requested.
+option(ENABLE_ZED_DEMO      "Build a Demo using ZED Camera" OFF)
+option(ENABLE_SAMPLES       "Build samples" OFF)
+option(ENABLE_TESTS         "Test library" OFF)
+option(LIBSGM_SHARED        "Build a shared library" OFF)
+option(BUILD_OPENCV_WRAPPER "Make library compatible with cv::Mat and cv::cuda::GpuMat of OpenCV" OFF)
+# Default GPU targets; a value supplied by the user (-D or toolchain) is left alone.
+if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+    set(CMAKE_CUDA_ARCHITECTURES "52;61;72;75;86")
+endif()
+
+project(libSGM VERSION 3.1.0)
+# Generate libsgm_config.h next to its template (in the source tree; the file is git-ignored).
+configure_file(
+    "${PROJECT_SOURCE_DIR}/include/libsgm_config.h.in"
+    "${PROJECT_SOURCE_DIR}/include/libsgm_config.h"
+    @ONLY)  # expand @VAR@ and #cmakedefine only; any ${...} text in the template stays literal
+
+add_subdirectory(src)
+
+if(ENABLE_SAMPLES)
+    add_subdirectory(sample)
+endif()
+
+if(ENABLE_TESTS)
+    add_subdirectory(test)
+endif()
@@ -0,0 +1,33 @@
+###############################################################################
+# Find LibSGM
+#
+# This sets the following variables:
+# LIBSGM_FOUND - True if LIBSGM was found (also written to the cache).
+# LIBSGM_INCLUDE_DIRS - Directories containing the LIBSGM include files.
+# LIBSGM_LIBRARY - Full path of the LIBSGM library file.
+# LIBSGM_LIBRARIES - Libraries needed to use LIBSGM (set only when found).
+
+# Reset the cached result on every run, then locate the library and its public header.
+set(LIBSGM_FOUND FALSE CACHE BOOL "" FORCE)
+find_library(LIBSGM_LIBRARY
+    NAMES sgm libsgm
+    PATH_SUFFIXES lib/
+)
+
+find_path(LIBSGM_INCLUDE_DIRS
+    NAMES libsgm.h
+    PATH_SUFFIXES include/
+)
+
+# Sets LIBSGM_FOUND and reports found / not found, honouring QUIET and REQUIRED.
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(LibSGM REQUIRED_VARS LIBSGM_LIBRARY LIBSGM_INCLUDE_DIRS)
+mark_as_advanced(LIBSGM_FOUND LIBSGM_LIBRARY LIBSGM_INCLUDE_DIRS)
+
+if(LIBSGM_FOUND)
+    set(LIBSGM_FOUND TRUE CACHE BOOL "" FORCE)  # kept in the cache for existing consumers
+    set(LIBSGM_LIBRARIES ${LIBSGM_LIBRARY})
+    if(NOT LibSGM_FIND_QUIETLY)
+        message(STATUS "LibSGM found ( include: ${LIBSGM_INCLUDE_DIRS}, lib: ${LIBSGM_LIBRARY})")
+    endif()
+endif()
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
@@ -0,0 +1,335 @@
+# installation process for US:
+Install Anaconda and CUDA Toolkit (compute capability >= 3.5)
+Check if cmake is installed.
+```
+$ cmake --version
+```
+If the reported version is lower than 3.18, replace it with a source build:
+```
+$ sudo apt remove cmake #Only if cmake is installed with <3.18 version
+$ wget https://github.com/Kitware/CMake/releases/download/v3.21.5/cmake-3.21.5.tar.gz
+$ tar -xzvf cmake-3.21.5.tar.gz
+$ cd cmake-3.21.5
+$ ./bootstrap
+$ make
+$ sudo make install
+$ cmake --version
+```
+If `cmake --version` now fails with a "command not found" error, check that the binary was installed:
+```
+$ find /usr/local/bin -name cmake
+```
+If the path is listed, close the terminal, open a new one, and check the version again.
+```
+$ cmake --version
+```
+## Environment:
+
+Create an environment (named libsgm) in conda
+```
+$ conda create --name libsgm
+$ conda activate libsgm
+```
+Installing Fixstars LibSGM:
+```
+$ git clone https://gitea.subseascanning.com/dejhost/libSGM.git
+$ cd libSGM
+$ git submodule update --init
+$ mkdir build  
+$ cd build
+$ cmake ../
+$ make
+```
+## Sample Execution
+```
+$ pwd
+.../libSGM
+$ cd build
+$ cmake .. -DENABLE_SAMPLES=on
+$ make
+$ cd sample
+```
+Place the `data` folder at `libSGM/build/sample/data`.
+ 
+Now run the sample once to confirm that libSGM is installed and working.
+For a single image pair, use the `stereosgm_new` executable.
+The disparity map is saved in the directory that contains the `stereosgm_new` executable (`.../libSGM/build/sample`).
+```
+$ ./stereosgm_new data/lc00012.bmp data/rc00012.bmp
+```
+To process multiple pairs one after another, use `stereosgm_image`; it saves `disparity.xml` files in the output directory.
+```
+$ ./stereosgm_image data/lc%05d.bmp data/rc%05d.bmp
+```
+
+---
+
+## **Pipeline Usage (Automated Path Resolution)**
+
+Use `run_sgm_pipeline.py` to run libSGM across the project folder structure automatically.
+It picks the **last rectified image pair** (highest timestamp) from each scan's `02_rect_images/` folder,
+runs `stereosgm_new`, and saves results to `03_sgm_disp_map/`.
+
+### **Folder structure assumed**
+
+```
+~/Speckle-Scanner_Processing_data/
+└── <project>/
+    └── <date>/
+        └── <session>/
+            └── <ScanXXXXXX>/
+                ├── 02_rect_images/     ← lc_ts<last>.png + rc_ts<same>.png (input)
+                ├── 03_sgm_disp_map/    ← disparity.xml + disparity_color.png (created)
+                └── 05_sgm_pcl/         ← untouched
+```
+
+Pairs are matched on the shared `ts` token (e.g. `ts1634840093`). Both formats work:
+`lc_ts1634840093_ck….png` / `rc_ts1634840093_ck….png` and `lc_ts1634840093.png` / `rc_ts1634840093.png`.
+
+### **Commands**
+
+```bash
+cd ~/Speckle-Scanner/05_disparity/libsgm
+
+# Process ALL scans in a session
+python run_sgm_pipeline.py \
+  --project Olsen_wings \
+  --date    2026-05-12 \
+  --session session1
+
+# Process ALL sessions on a date (omit --session)
+python run_sgm_pipeline.py \
+  --project Olsen_wings \
+  --date    2026-05-12
+
+# Process a SINGLE scan
+python run_sgm_pipeline.py \
+  --project Olsen_wings \
+  --date    2026-05-12 \
+  --session session1 \
+  --scan    Scan000001
+
+# Custom SGM parameters
+python run_sgm_pipeline.py \
+  --project    Olsen_wings \
+  --date       2026-05-12 \
+  --session    session1 \
+  --disp_size  128 \
+  --P1         8 \
+  --P2         32 \
+  --min_disp   0 \
+  --num_paths  8 \
+  --census_type 1
+```
+
+### **Pipeline parameters**
+
+| Parameter       | Default | Description                                                                      |
+|-----------------|---------|----------------------------------------------------------------------------------|
+| `--project`     | —       | Project name (e.g. `Olsen_wings`)                                                |
+| `--date`        | —       | Date string (e.g. `2026-05-12`)                                                  |
+| `--session`     | all     | Session name (e.g. `session1`); omit to process **all sessions** on that date    |
+| `--scan`        | all     | Single scan (e.g. `Scan000001`); omit to process all scans in the session        |
+| `--disp_size`   | `256`   | Maximum disparity value (64, 128, or 256)                                        |
+| `--P1`          | `10`    | SGM penalty for disparity change of ±1                                           |
+| `--P2`          | `120`   | SGM penalty for disparity change > 1                                             |
+| `--uniqueness`  | `0.80`  | Uniqueness ratio threshold                                                        |
+| `--num_paths`   | `8`     | Scanlines for cost aggregation (4 or 8)                                          |
+| `--min_disp`    | `-160`  | Minimum disparity value                                                           |
+| `--LR_max_diff` | `1`     | Maximum allowed left-right disparity difference                                  |
+| `--census_type` | `1`     | Census transform type: 0=CENSUS_9x7, 1=SYMMETRIC_CENSUS_9x7                     |
+
+### **What gets saved in `03_sgm_disp_map/`**
+
+| File | Description |
+|------|-------------|
+| `disparity.xml` | Raw disparity matrix (OpenCV FileStorage format, CV_16S) |
+| `disparity_color.png` | Colorized disparity image (TURBO colormap, 8-bit) |
+
+---
+
+## **Direct Binary Usage**
+
+Run `stereosgm_new` manually with explicit paths (must run from the build/sample directory or use full paths):
+
+```bash
+cd ~/Speckle-Scanner/05_disparity/libsgm/build/sample
+
+# Default parameters, save to current directory
+./stereosgm_new data/lc00012.bmp data/rc00012.bmp
+
+# Save to a specific output folder, no display window
+./stereosgm_new \
+  /path/to/lc_image.png \
+  /path/to/rc_image.png \
+  --output_dir=/path/to/03_sgm_disp_map \
+  --no_display=1 \
+  --disp_size=128 --P1=8 --P2=32
+```
+
+---
+
+## **Available Parameters**
+
+| Parameter            | Default Value  | Description                                                                |
+| -------------------- | -------------- | -------------------------------------------------------------------------- |
+| `@left-image-format` | `none`         | Format string for the path to input left image (e.g., "left/img_%04d.png") |
+| `@right-image-format`| `none`         | Format string for the path to input right image                            |
+| `--disp_size`        | `256`          | Maximum possible disparity value                                           |
+| `--P1`               | `10`           | Penalty for disparity change of ±1                                         |
+| `--P2`               | `120`          | Penalty for disparity change > 1                                           |
+| `--uniqueness`       | `0.80`         | Margin ratio for uniqueness constraint                                     |
+| `--num_paths`        | `8`            | Number of scanlines used in cost aggregation (4 or 8)                      |
+| `--min_disp`         | `-160`         | Minimum disparity value                                                    |
+| `--LR_max_diff`      | `1`            | Maximum allowed left-right disparity difference                            |
+| `--census_type`      | `1`            | Census transform type (0: CENSUS_9x7, 1: SYMMETRIC_CENSUS_9x7)             |
+| `--interval`         | `1`            | Polling interval (in seconds) for checking new stereo image pairs          |
+| `--output_dir`       | `.`            | Directory to save `disparity.xml` and `disparity_color.png`                |
+| `--no_display`       | `0`            | Set to `1` to skip interactive display window (required for pipeline/headless use) |
+| `--help or -h`       |                | Show help message                                                          |
+
+### **Custom Parameters** 
+
+
+You can override any parameter through command-line arguments. Below is an example with some customized parameters:
+
+```bash
+./stereosgm_image data/lc%05d.bmp data/rc%05d.bmp \
+  --disp_size=128 --P1=8 --P2=32 --interval=2
+```
+
+
+# libSGM (Original)
+---
+A CUDA implementation performing Semi-Global Matching.
+
+## Introduction
+---
+
+libSGM is a library that implements the Semi-Global Matching algorithm in CUDA.  
+From a pair of appropriately calibrated input images, we can obtain the disparity map.
+
+## Features
+---
+Because it uses CUDA, we can compute the disparity map at high speed.
+
+## Performance
+The libSGM performance obtained from benchmark sample
+### Settings
+- image size : 1024 x 440
+- disparity size : 128
+- sgm path : 4 path
+- subpixel : enabled
+
+### Results
+|Device|CUDA version|Processing Time[Milliseconds]|FPS|
+|---|---|---|---|
+|GTX 1080 Ti|10.1|2.0|495.1|
+|GeForce RTX 3080|11.1|1.5|651.3|
+|Tegra X2|10.0|28.5|35.1|
+|Xavier(MODE_15W)|10.2|17.3|57.7|
+|Xavier(MAXN)|10.2|9.0|110.7|
+
+## Requirements
+|Package Name|Minimum Requirements|Note|
+|---|---|---|
+|CMake|version >= 3.18||
+|CUDA Toolkit|compute capability >= 3.5||
+|OpenCV|version >= 3.4.8|for samples|
+|OpenCV CUDA module|version >= 3.4.8|for OpenCV wrapper|
+|ZED SDK|version >= 3.0|for ZED sample|
+
+## Build Instructions
+```
+$ git clone https://github.com/fixstars/libSGM.git
+$ cd libSGM
+$ git submodule update --init  # It is needed if ENABLE_TESTS option is set to ON
+$ mkdir build
+$ cd build
+$ cmake ../  # Several options available
+$ make
+```
+
+## Sample Execution
+```
+$ pwd
+.../libSGM
+$ cd build
+$ cmake .. -DENABLE_SAMPLES=on
+$ make
+$ cd sample
+$ ./stereosgm_movie <left image path format> <right image path format> <disparity_size>
+left image path format: the format used for the file paths to the left input images
+right image path format: the format used for the file paths to the right input images
+disparity_size: the maximum number of disparities (optional)
+```
+
+"disparity_size" is optional. By default, it is 128.
+
+Next, we explain the meaning of the "left image path format" and "right image path format".  
+When provided with the following set of files, we should pass the "path formats" given below.
+```
+left_image_0000.pgm
+left_image_0001.pgm
+left_image_0002.pgm
+left_image_0003.pgm
+...
+
+right_image_0000.pgm
+right_image_0001.pgm
+right_image_0002.pgm
+right_image_0003.pgm
+```
+
+```
+$ ./stereosgm_movie left_image_%04d.pgm right_image_%04d.pgm
+```
+
+The sample images available at [Daimler Urban Scene Segmentation Benchmark Dataset 2014](http://www.6d-vision.com/scene-labeling) are used to test the software.
+
+## Test Execution
+libSGM uses [Google Test](https://github.com/google/googletest) for its tests, included as a Git submodule.  
+So the submodule must be initialized first with the following command.
+
+```
+$ pwd
+.../libSGM
+$ git submodule update --init
+```
+
+We can run tests after a build.
+
+```
+$ pwd
+.../libSGM
+$ cd build
+$ cd test
+$ ./sgm-test
+```
+
+The test code compares our implementation of each function to a naive implementation.
+
+## Python pipeline runner dependencies
+
+`run_sgm_pipeline.py` uses only the Python standard library. Disparity is computed by the compiled `stereosgm_new` binary.
+
+See `requirements.txt` in this folder for system build requirements (CUDA, CMake, OpenCV C++).
+
+```bash
+# No pip packages needed for the Python runner.
+# Build the binary first (see above), then:
+cd ~/Speckle-Scanner/05_disparity/libsgm
+python run_sgm_pipeline.py --project <project> --date <date>
+```
+
+## Author
+The "adaskit Team"  
+
+The adaskit is an open-source project created by [Fixstars Corporation](https://www.fixstars.com/) and its subsidiary companies including [Fixstars Autonomous Technologies](https://at.fixstars.com/), aimed at contributing to the ADAS industry by developing high-performance implementations for algorithms with high computational cost.
+
+## License
+Apache License 2.0
+
+
+
+
@@ -0,0 +1,180 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#ifndef __LIBSGM_H__
+#define __LIBSGM_H__
+
+/**
+* @mainpage stereo-sgm
+* See sgm::StereoSGM
+*/
+
+/**
+* @file libsgm.h
+* stereo-sgm main header
+*/
+
+#include "libsgm_config.h"
+
+#if defined(LIBSGM_SHARED)
+#if defined(WIN32) || defined(_WIN32)
+#if defined sgm_EXPORTS
+#define LIBSGM_API __declspec(dllexport)
+#else
+#define LIBSGM_API __declspec(dllimport)
+#endif
+#else
+#define LIBSGM_API __attribute__((visibility("default")))
+#endif
+#else
+#define LIBSGM_API
+#endif
+
+namespace sgm
+{
+
+/**
+* @brief Indicates input/output pointer type. Bit 0 is set in the CUDA2* values, bit 1 in the *2CUDA values.
+*/
+enum ExecuteInOut
+{
+	EXECUTE_INOUT_HOST2HOST = (0 << 1) | 0, //!< Value 0. Name reads as host input, host output (NOTE(review): confirm).
+	EXECUTE_INOUT_HOST2CUDA = (1 << 1) | 0, //!< Value 2. Name reads as host input, CUDA output (NOTE(review): confirm).
+	EXECUTE_INOUT_CUDA2HOST = (0 << 1) | 1, //!< Value 1. Name reads as CUDA input, host output (NOTE(review): confirm).
+	EXECUTE_INOUT_CUDA2CUDA = (1 << 1) | 1, //!< Value 3. Name reads as CUDA input, CUDA output (NOTE(review): confirm).
+};
+
+/**
+* @brief Indicates number of scanlines which will be used.
+*/
+enum class PathType
+{
+	SCAN_4PATH, //!< Horizontal and vertical paths.
+	SCAN_8PATH  //!< Horizontal, vertical and oblique paths.
+};
+
+/**
+* @brief Indicates census type which will be used.
+*/
+enum class CensusType
+{
+	CENSUS_9x7,          //!< Enumerator value 0. The name suggests a 9x7 census window (NOTE(review): confirm).
+	SYMMETRIC_CENSUS_9x7 //!< Enumerator value 1. Default census type of StereoSGM::Parameters.
+};
+
+/**
+* @brief Stereo Semi-Global Matching engine. Not copyable; its state is held behind the forward-declared Impl pointer.
+*/
+class StereoSGM
+{
+public:
+
+	static const int SUBPIXEL_SHIFT = 4; //!< Number of fractional bits of a disparity value when subpixel is enabled.
+	static const int SUBPIXEL_SCALE = (1 << SUBPIXEL_SHIFT); //!< Scale factor (16) applied to disparity values when subpixel is enabled.
+
+	/**
+	* @brief Available options for StereoSGM
+	*/
+	struct Parameters
+	{
+		int P1;
+		int P2;
+		float uniqueness;
+		bool subpixel;
+		PathType path_type;
+		int min_disp;
+		int LR_max_diff;
+		CensusType census_type;
+
+		/**
+		* @param P1 Penalty on the disparity change by plus or minus 1 between neighbor pixels.
+		* @param P2 Penalty on the disparity change by more than 1 between neighbor pixels.
+		* @param uniqueness Margin, as a ratio, by which the best cost function value must win over the second-best one.
+		* @param subpixel Disparity value has 4 fractional bits if subpixel option is enabled.
+		* @param path_type Number of scanlines used in cost aggregation.
+		* @param min_disp Minimum possible disparity value.
+		* @param LR_max_diff Maximum disparity difference (in pixels) accepted by the left-right consistency check. The check is disabled if this value is negative.
+		* @param census_type Type of census transform.
+		*/
+		LIBSGM_API Parameters(int P1 = 10, int P2 = 120, float uniqueness = 0.95f, bool subpixel = false, PathType path_type = PathType::SCAN_8PATH,
+			int min_disp = 0, int LR_max_diff = 1, CensusType census_type = CensusType::SYMMETRIC_CENSUS_9x7);
+	};
+
+	/**
+	* @param width Processed image's width.
+	* @param height Processed image's height.
+	* @param disparity_size It must be 64, 128 or 256.
+	* @param input_depth_bits Processed image's bits per pixel. It must be 8, 16 or 32.
+	* @param output_depth_bits Disparity image's bits per pixel. It must be 8 or 16.
+	* @param inout_type Specify input/output pointer type. See sgm::ExecuteInOut.
+	* @attention
+	* output_depth_bits must be set to 16 when subpixel is enabled.
+	*/
+	LIBSGM_API StereoSGM(int width, int height, int disparity_size, int input_depth_bits, int output_depth_bits,
+		ExecuteInOut inout_type, const Parameters& param = Parameters());
+
+	/**
+	* @param width Processed image's width.
+	* @param height Processed image's height.
+	* @param disparity_size It must be 64, 128 or 256.
+	* @param input_depth_bits Processed image's bits per pixel. It must be 8, 16 or 32.
+	* @param output_depth_bits Disparity image's bits per pixel. It must be 8 or 16.
+	* @param src_pitch Source image's pitch (pixels).
+	* @param dst_pitch Destination image's pitch (pixels).
+	* @param inout_type Specify input/output pointer type. See sgm::ExecuteInOut.
+	* @attention
+	* output_depth_bits must be set to 16 when subpixel is enabled.
+	*/
+	LIBSGM_API StereoSGM(int width, int height, int disparity_size, int input_depth_bits, int output_depth_bits, int src_pitch, int dst_pitch,
+		ExecuteInOut inout_type, const Parameters& param = Parameters());
+
+	LIBSGM_API virtual ~StereoSGM();
+
+	/**
+	* Execute stereo semi global matching.
+	* @param left_pixels  Pointer to the input left image.
+	* @param right_pixels Pointer to the input right image.
+	* @param dst          Output pointer. User must allocate enough memory.
+	* @attention
+	* You need to allocate dst memory at least width x height x sizeof(element_type) bytes.
+	* The element_type is uint8_t for output_depth_bits == 8 and uint16_t for output_depth_bits == 16.
+	* Note that dst element values are multiplied by StereoSGM::SUBPIXEL_SCALE if the subpixel option is enabled.
+	* The value of an invalid disparity is equal to the return value of the `get_invalid_disparity` member function.
+	*/
+	LIBSGM_API void execute(const void* left_pixels, const void* right_pixels, void* dst);
+
+	/**
+	* Returns the invalid disparity value, generated from Parameters::min_disp and Parameters::subpixel.
+	* @attention
+	* Cast properly if you receive disparity value as `unsigned` type.
+	* See sample/movie for an example of this.
+	*/
+	LIBSGM_API int get_invalid_disparity() const;
+
+private:
+
+	StereoSGM(const StereoSGM&); // Copy construction is disabled: declared private.
+	StereoSGM& operator=(const StereoSGM&); // Copy assignment is disabled: declared private.
+
+	class Impl; // Implementation type; only forward-declared in this header.
+	Impl* impl_; // Pointer to the implementation object.
+};
+
+} // namespace sgm
+
+#endif // !__LIBSGM_H__
+
+#include "libsgm_wrapper.h"
@@ -0,0 +1,13 @@
+#ifndef __LIBSGM_CONFIG_H__
+#define __LIBSGM_CONFIG_H__
+
+/*
+ * Build-configuration header template.
+ * The cmakedefine directives and at-sign placeholders below are meant to be
+ * substituted by CMake when the project is configured (presumably through
+ * configure_file; the generated include/libsgm_config.h is git-ignored).
+ */
+
+/* Defined when the library is configured with the LIBSGM_SHARED option enabled. */
+#cmakedefine LIBSGM_SHARED
+
+/* Library version, filled in from the CMake project version variables. */
+#define LIBSGM_VERSION @libSGM_VERSION@
+#define LIBSGM_VERSION_MAJOR @libSGM_VERSION_MAJOR@
+#define LIBSGM_VERSION_MINOR @libSGM_VERSION_MINOR@
+#define LIBSGM_VERSION_PATCH @libSGM_VERSION_PATCH@
+
+/* Defined when the BUILD_OPENCV_WRAPPER option is enabled; libsgm_wrapper.h tests it to expose the OpenCV overloads. */
+#cmakedefine BUILD_OPENCV_WRAPPER
+
+#endif // __LIBSGM_CONFIG_H__
@@ -0,0 +1,84 @@
+#ifndef __LIBSGM_WRAPPER_H__
+#define __LIBSGM_WRAPPER_H__
+
+#include "libsgm.h"
+
+#include <memory>
+#ifdef BUILD_OPENCV_WRAPPER
+#include <opencv2/core/cuda.hpp>
+#endif
+
+namespace sgm
+{
+
+/**
+* @brief LibSGMWrapper class which is wrapper for sgm::StereoSGM.
+*/
+class LibSGMWrapper
+{
+public:
+
+	/**
+	* Store the matching parameters used by later execute() calls.
+	* @param numDisparity Maximum disparity minus minimum disparity.
+	* @param P1 Penalty on the disparity change by plus or minus 1 between neighbor pixels.
+	* @param P2 Penalty on the disparity change by more than 1 between neighbor pixels.
+	* @param uniquenessRatio Margin in ratio by which the best cost function value should be at least second one.
+	* @param subpixel Disparity value has 4 fractional bits if subpixel option is enabled.
+	* @param pathType Number of scanlines used in cost aggregation.
+	* @param minDisparity Minimum possible disparity value.
+	* @param lrMaxDiff Acceptable difference in pixels used by the LR consistency check. The LR consistency check is disabled if this value is negative.
+	* @param censusType Type of census transform.
+	*/
+	LIBSGM_API LibSGMWrapper(int numDisparity = 128, int P1 = 10, int P2 = 120, float uniquenessRatio = 0.95f,
+		bool subpixel = false, PathType pathType = PathType::SCAN_8PATH, int minDisparity = 0, int lrMaxDiff = 1, CensusType censusType = CensusType::SYMMETRIC_CENSUS_9x7);
+	LIBSGM_API ~LibSGMWrapper();
+
+	// Read-only accessors; each one corresponds to the constructor argument of the same name.
+	LIBSGM_API int getNumDisparities() const;
+	LIBSGM_API int getP1() const;
+	LIBSGM_API int getP2() const;
+	LIBSGM_API float getUniquenessRatio() const;
+	LIBSGM_API bool hasSubpixel() const;
+	LIBSGM_API PathType getPathType() const;
+	LIBSGM_API int getMinDisparity() const;
+	LIBSGM_API int getLrMaxDiff() const;
+	LIBSGM_API CensusType getCensusType() const;
+	// Value that marks an invalid pixel in the output disparity image
+	// (presumably forwarded from sgm::StereoSGM::get_invalid_disparity -- confirm in the implementation).
+	LIBSGM_API int getInvalidDisparity() const;
+
+#ifdef BUILD_OPENCV_WRAPPER
+
+	/**
+	* Execute stereo semi global matching via wrapper class.
+	* @param I1        Input left image.  Image's type must be CV_8U, CV_16U or CV_32S.
+	* @param I2        Input right image.  Image's size and type must be the same as I1.
+	* @param disparity Output image.  Its memory will be allocated automatically depending on the input image size.
+	* @attention
+	* type of output image `disparity` is CV_16S.
+	* Note that each disparity element value is multiplied by StereoSGM::SUBPIXEL_SCALE if the subpixel option is enabled.
+	*/
+	LIBSGM_API void execute(const cv::cuda::GpuMat& I1, const cv::cuda::GpuMat& I2, cv::cuda::GpuMat& disparity);
+
+	/**
+	* Execute stereo semi global matching via wrapper class.
+	* @param I1        Input left image.  Image's type must be CV_8U, CV_16U or CV_32S.
+	* @param I2        Input right image.  Image's size and type must be the same as I1.
+	* @param disparity Output image.  Its memory will be allocated automatically depending on the input image size.
+	* @attention
+	* type of output image `disparity` is CV_16S.
+	* Note that each disparity element value is multiplied by StereoSGM::SUBPIXEL_SCALE if the subpixel option is enabled.
+	*/
+	LIBSGM_API void execute(const cv::Mat& I1, const cv::Mat& I2, cv::Mat& disparity);
+
+#endif // BUILD_OPENCV_WRAPPER
+
+private:
+
+	// Creator is defined in the implementation file; only a forward declaration is needed here.
+	struct Creator;
+	// Underlying matcher instance owned by the wrapper.
+	std::unique_ptr<sgm::StereoSGM> sgm_;
+	int numDisparity_;
+	sgm::StereoSGM::Parameters param_;
+	// NOTE(review): presumably remembers the configuration sgm_ was last built with
+	// so it can be reused across execute() calls -- confirm in the implementation.
+	std::unique_ptr<Creator> prev_;
+};
+
+} // namespace sgm
+
+#endif // __LIBSGM_WRAPPER_H__
@@ -0,0 +1,9 @@
+%YAML:1.0
+---
+Q: !!opencv-matrix
+   rows: 4
+   cols: 4
+   dt: d
+   data: [ 1., 0., 0., -452.58969879150391, 0., 1., 0.,
+       -732.08112335205078, 0., 0., 0., 3269.0086731896672, 0., 0.,
+       1.0200604866284457, 1125.7629393222996 ]
@@ -0,0 +1,18 @@
+# 05_disparity/libsgm — dependencies
+#
+# The Python pipeline runner (run_sgm_pipeline.py) uses only the standard library.
+# Disparity computation is done by the compiled stereosgm_new binary (CUDA C++).
+#
+# --- System build requirements (not installable via pip) ---
+#   - NVIDIA GPU with CUDA compute capability >= 3.5
+#   - CUDA Toolkit 11.x or 12.x
+#   - CMake >= 3.18
+#   - OpenCV (C++ headers + libs, for building libSGM samples)
+#
+# Build:
+#   cd ~/Speckle-Scanner/05_disparity/libsgm
+#   mkdir -p build && cd build
+#   cmake .. -DENABLE_SAMPLES=on
+#   make stereosgm_new -j4
+#
+# No pip packages required to run run_sgm_pipeline.py after the binary is built.
@@ -0,0 +1,201 @@
+"""
+Pipeline runner for libSGM stereo disparity.
+
+Resolves all paths from the project folder structure and drives
+the stereosgm_new binary for each scan in a session (or all sessions on a date).
+
+For each scan it takes the LAST matched lc_/rc_ image pair from 02_rect_images/
+(images sorted by timestamp — highest timestamp = last acquired image).
+
+Output layout per scan:
+  <processing_dir>/<project>/<date>/<session>/<scan>/
+    02_rect_images/     <- input (lc_ts<last>.png + rc_ts<same>.png)
+    03_sgm_disp_map/    <- disparity.xml + disparity_color.png  (created here)
+    05_sgm_pcl/         <- untouched
+
+Binary:
+  ~/Speckle-Scanner/05_disparity/libsgm/build/sample/stereosgm_new
+"""
+
+import sys
+import re
+import argparse
+import subprocess
+from pathlib import Path
+
+# Resolve config.py from ~/Speckle-Scanner regardless of CWD
+sys.path.insert(0, str(Path.home() / "Speckle-Scanner"))
+import config  # noqa: E402
+
+BINARY = Path(__file__).parent / "build" / "sample" / "stereosgm_new"
+
+
+def extract_ts_token(filename, prefix="lc_"):
+    """Extract ts token from lc_ts1634840093.png or lc_ts1634840093_ck....png."""
+    m = re.search(rf"^{re.escape(prefix)}(ts\d+)", filename, re.IGNORECASE)
+    if not m:
+        return None, None
+    ts_token = m.group(1).lower()
+    ts_int = int(re.search(r"\d+", ts_token).group())
+    return ts_token, ts_int
+
+
+def find_rc_for_ts(rect_dir, ts_token):
+    """Match rc image by shared ts token (ck suffix optional)."""
+    rc_matches = sorted(rect_dir.glob(f"rc_{ts_token}_*.png"))
+    if not rc_matches:
+        rc_matches = sorted(rect_dir.glob(f"rc_{ts_token}*.png"))
+    return rc_matches[0] if rc_matches else None
+
+
+def find_last_lc_rc_pair(rect_dir):
+    """Return (lc_path, rc_path) for the highest-timestamp matched pair in rect_dir."""
+    rect_dir = Path(rect_dir)
+    pairs = []
+    for lc in rect_dir.glob("lc_ts*.png"):
+        ts_token, ts_int = extract_ts_token(lc.name, "lc_")
+        if ts_token is None:
+            continue
+        rc = find_rc_for_ts(rect_dir, ts_token)
+        if rc is None:
+            continue
+        pairs.append((ts_int, lc, rc))
+
+    if not pairs:
+        return None, None
+
+    pairs.sort(key=lambda item: item[0])
+    _, lc, rc = pairs[-1]
+    return lc, rc
+
+
+def build_cmd(lc, rc, output_dir, sgm_args):
+    cmd = [
+        str(BINARY),
+        str(lc),
+        str(rc),
+        f"--output_dir={output_dir}",
+        "--no_display=1",
+    ]
+    for key, val in sgm_args.items():
+        if val is not None:
+            cmd.append(f"--{key}={val}")
+    return cmd
+
+
+def run_scan(project, date, session, scan, sgm_args):
+    rect_dir = config.PROCESSING_DIR / project / date / session / scan / "02_rect_images"
+    if not rect_dir.exists():
+        print(f"[SKIP] {session}/{scan}: 02_rect_images not found at {rect_dir}")
+        return False
+
+    lc, rc = find_last_lc_rc_pair(rect_dir)
+    if lc is None:
+        print(f"[SKIP] {session}/{scan}: no lc_ts*.png images found in {rect_dir}")
+        return False
+    if rc is None:
+        print(f"[SKIP] {session}/{scan}: no matching rc image for {lc.name}")
+        return False
+
+    output_dir = config.get_processing_step_dir(project, date, session, scan, "03_sgm_disp_map")
+
+    print(f"\n{'='*60}")
+    print(f"[SCAN] {session}/{scan}")
+    print(f"  lc     : {lc.name}")
+    print(f"  rc     : {rc.name}")
+    print(f"  output : {output_dir}")
+    print(f"{'='*60}")
+
+    cmd = build_cmd(lc, rc, output_dir, sgm_args)
+    result = subprocess.run(cmd)
+    if result.returncode != 0:
+        print(f"[FAIL] {session}/{scan} exited with code {result.returncode}")
+        return False
+    print(f"[DONE] {session}/{scan}")
+    return True
+
+
+def run_session(project, date, session, scan_arg, sgm_args):
+    if scan_arg:
+        scans = [scan_arg]
+    else:
+        scans = config.list_scan_dirs(project, date, session)
+        if not scans:
+            print(f"[WARN] No scan folders found in {project}/{date}/{session}")
+            return [], []
+        print(f"\n  Session {session}: {len(scans)} scan(s) found")
+
+    failed = []
+    for scan in scans:
+        ok = run_scan(project, date, session, scan, sgm_args)
+        if not ok:
+            failed.append(f"{session}/{scan}")
+    return scans, failed
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="libSGM disparity pipeline runner — resolves paths from project structure"
+    )
+    # Project location
+    parser.add_argument("--project", required=True,  help="Project name (e.g. Olsen_wings)")
+    parser.add_argument("--date",    required=True,  help="Date string (e.g. 2026-05-12)")
+    parser.add_argument("--session", default=None,   help="Session name (e.g. session1); omit to process ALL sessions on that date")
+    parser.add_argument("--scan",    default=None,   help="Single scan (e.g. Scan000001); omit to process all scans in the session")
+
+    # SGM parameters — all optional, forwarded to stereosgm_new
+    parser.add_argument("--disp_size",   type=int,   default=None, help="Maximum disparity value (64, 128, or 256; default 256)")
+    parser.add_argument("--P1",          type=int,   default=None, help="SGM penalty for disparity change of ±1 (default 10)")
+    parser.add_argument("--P2",          type=int,   default=None, help="SGM penalty for disparity change >1 (default 120)")
+    parser.add_argument("--uniqueness",  type=float, default=None, help="Uniqueness ratio threshold (default 0.80)")
+    parser.add_argument("--num_paths",   type=int,   default=None, choices=[4, 8], help="Scanlines for cost aggregation: 4 or 8 (default 8)")
+    parser.add_argument("--min_disp",    type=int,   default=None, help="Minimum disparity value (default -160)")
+    parser.add_argument("--LR_max_diff", type=int,   default=None, help="Max left-right disparity difference (default 1)")
+    parser.add_argument("--census_type", type=int,   default=None, choices=[0, 1], help="Census transform type: 0=CENSUS_9x7, 1=SYMMETRIC_CENSUS_9x7 (default 1)")
+
+    args = parser.parse_args()
+
+    if not BINARY.exists():
+        print(f"ERROR: stereosgm_new binary not found at {BINARY}")
+        print("Build it first: cd ~/Speckle-Scanner/05_disparity/libsgm/build && make stereosgm_new")
+        sys.exit(1)
+
+    sgm_args = {
+        "disp_size":   args.disp_size,
+        "P1":          args.P1,
+        "P2":          args.P2,
+        "uniqueness":  args.uniqueness,
+        "num_paths":   args.num_paths,
+        "min_disp":    args.min_disp,
+        "LR_max_diff": args.LR_max_diff,
+        "census_type": args.census_type,
+    }
+
+    # Determine sessions to process
+    if args.session:
+        sessions = [args.session]
+    else:
+        sessions = config.list_session_dirs(args.project, args.date)
+        if not sessions:
+            print(f"No session folders found under {args.project}/{args.date}")
+            sys.exit(1)
+        print(f"Found {len(sessions)} session(s): {sessions}")
+
+    total_scans = 0
+    all_failed  = []
+    for session in sessions:
+        scans, failed = run_session(
+            args.project, args.date, session, args.scan, sgm_args
+        )
+        total_scans += len(scans)
+        all_failed.extend(failed)
+
+    print(f"\n{'='*60}")
+    print(f"Finished: {total_scans - len(all_failed)}/{total_scans} scans succeeded.")
+    if all_failed:
+        print(f"Failed: {all_failed}")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,62 @@
+cmake_minimum_required(VERSION 3.18)
+
+project(samples LANGUAGES CXX CUDA)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+# required packages
+find_package(OpenCV REQUIRED)
+
+# sources shared by every sample executable
+set(SRCS_COMMON sample_common.cpp sample_common.h)
+
+# samples_add_sample(<name> [<extra include dir>...])
+#
+# Defines the executable <name> from <name>.cpp plus the shared sample sources,
+# adds the OpenCV include directories (and any extra ones given) and links it
+# against the sgm library and OpenCV.
+#
+# Example: samples_add_sample(stereosgm_image)
+function(samples_add_sample name)
+	add_executable(${name} ${name}.cpp ${SRCS_COMMON})
+	target_include_directories(${name} PRIVATE ${OpenCV_INCLUDE_DIRS} ${ARGN})
+	# Executables have no consumers, so everything is linked PRIVATE.
+	target_link_libraries(${name} PRIVATE sgm ${OpenCV_LIBS})
+endfunction()
+
+# sample image
+samples_add_sample(stereosgm_image)
+
+# sample movie
+samples_add_sample(stereosgm_movie)
+
+# sample mynew
+samples_add_sample(stereosgm_new)
+
+# sample benchmark
+samples_add_sample(stereosgm_benchmark)
+
+# sample reprojection
+samples_add_sample(stereosgm_reprojection)
+
+# sample image with cv::GpuMat
+if(BUILD_OPENCV_WRAPPER)
+	samples_add_sample(stereosgm_image_cv_gpumat)
+endif()
+
+# sample ZED camera
+if(ENABLE_ZED_DEMO)
+	if(WIN32)
+		set(ZED_SDK_LIB "C:\\Program Files (x86)\\ZED SDK\\lib\\sl_zed64.lib" CACHE STRING "ZED SDK library(sl_zed**.llb) path.")
+		set(ZED_SDK_INCLUDE_DIR "C:\\Program Files (x86)\\ZED SDK\\include" CACHE STRING "ZED SDK include path.")
+	else()
+		set(ZED_SDK_LIB "/usr/local/zed/lib/libsl_zed.so" CACHE STRING "ZED SDK library(sl_zed**.llb) path.")
+		set(ZED_SDK_INCLUDE_DIR "/usr/local/zed/include" CACHE STRING "ZED SDK include path.")
+	endif()
+
+	find_package(ZED 3 REQUIRED)
+	# Drop the blanks that follow list separators in the NPP library list.
+	string(REGEX REPLACE [[; +]] [[;]] CUDA_NPP_LIBRARIES_ZED "${CUDA_NPP_LIBRARIES_ZED}")
+
+	samples_add_sample(stereosgm_zed ${ZED_INCLUDE_DIRS})
+	target_link_directories(stereosgm_zed PRIVATE ${ZED_LIBRARY_DIR})
+	target_link_libraries(stereosgm_zed PRIVATE ${ZED_LIBRARIES} ${CUDA_NPP_LIBRARIES_ZED})
+endif()
@@ -0,0 +1,15 @@
+<?xml version="1.0"?>
+<opencv_storage>
+
+<!--  Intrinsic parameters -->
+<FocalLengthX>1267.485352</FocalLengthX> <!--  focal length x (pixel) -->
+<FocalLengthY>1224.548950</FocalLengthY> <!--  focal length y (pixel) -->
+<CenterX>472.735474</CenterX>            <!--  principal point x (pixel) -->
+<CenterY>175.787781</CenterY>            <!--  principal point y (pixel) -->
+
+<!--  Extrinsic parameters -->
+<BaseLine>0.214382</BaseLine>            <!--  baseline (meter) -->
+<Height>1.170000</Height>                <!--  height position (meter) -->
+<Tilt>0.081276</Tilt>                    <!--  tilt angle (radian) -->
+
+</opencv_storage>
@@ -0,0 +1,10 @@
+<?xml version="1.0"?>
+<opencv_storage>
+<FocalLengthX>1249.7700195</FocalLengthX>
+<FocalLengthY>1249.7700195</FocalLengthY>
+<CenterX>480.8460083</CenterX>
+<CenterY>237.4100037</CenterY>
+<BaseLine>0.2339240</BaseLine>
+<Height>1.2000000</Height>
+<Tilt>0.07</Tilt>
+</opencv_storage>
@@ -0,0 +1,4 @@
+# sample mynew: builds stereosgm_new from stereosgm_new.cpp plus the shared sample sources
+add_executable(stereosgm_new stereosgm_new.cpp ${SRCS_COMMON})
+target_include_directories(stereosgm_new PRIVATE ${OpenCV_INCLUDE_DIRS})
+# An explicit PRIVATE avoids the legacy keyword-less link signature.
+target_link_libraries(stereosgm_new PRIVATE sgm ${OpenCV_LIBS})
@@ -0,0 +1,160 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include <iostream>
+#include <chrono>
+#include <stdexcept>
+
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/highgui.hpp>
+#include <fstream> // Add this line to use std::ofstream for file output
+
+#include <libsgm.h>
+
+#include "sample_common.h"
+
+// Option table handed to cv::CommandLineParser in main().
+// Each entry reads "{ name | default value | help text }"; names starting
+// with '@' are positional arguments.
+static const std::string keys =
+"{ @left-image-format  | <none> | format string for path to input left image  }"
+"{ @right-image-format | <none> | format string for path to input right image }"
+"{ disp_size           |    256 | maximum possible disparity value            }"
+"{ start_number        |      0 | index to start reading                      }"
+"{ help h              |        | display this help and exit                  }";
+
+/**
+ * Brings a stereo image pair to a common size and to 8-bit depth.
+ */
+class ImagePreprocessor {
+public:
+    /**
+     * Crop both images to their common top-left region and convert them to CV_8U.
+     *
+     * @param img_left  Left image; replaced in place by the cropped 8-bit copy.
+     * @param img_right Right image; replaced in place by the cropped 8-bit copy.
+     *
+     * Both results own continuous pixel storage, so the caller may copy
+     * rows * cols elements straight from Mat::data.
+     */
+    void preprocess_image_pair(cv::Mat& img_left, cv::Mat& img_right) {
+        // Largest region present in both images.
+        const int min_height = std::min(img_left.rows, img_right.rows);
+        const int min_width = std::min(img_left.cols, img_right.cols);
+        const cv::Rect common_region(0, 0, min_width, min_height);
+
+        // A ROI is only a view into the wider parent buffer, so its rows are not
+        // adjacent in memory when the image was actually narrowed. clone() makes a
+        // compact copy; without it a raw upload of Mat::data reads the wrong pixels.
+        img_left = img_left(common_region).clone();
+        img_right = img_right(common_region).clone();
+
+        // Ensure 8-bit depth (no colour conversion is performed here).
+        img_left.convertTo(img_left, CV_8U);
+        img_right.convertTo(img_right, CV_8U);
+    }
+};
+
+/**
+ * Sample entry point: reads numbered left/right image pairs, computes a
+ * disparity map for each pair on the GPU, writes it to
+ * disparity_output_<frame>.png and shows it until a key is pressed.
+ * The loop ends at the first frame number for which either image cannot be read.
+ */
+int main(int argc, char* argv[])
+{
+    cv::CommandLineParser parser(argc, argv, keys);
+    if (parser.has("help")) {
+        parser.printMessage();
+        return 0;
+    }
+
+    // The two positional arguments are printf-style patterns that receive the frame number.
+    const std::string image_format_L = parser.get<cv::String>("@left-image-format");
+    const std::string image_format_R = parser.get<cv::String>("@right-image-format");
+    const int disp_size = parser.get<int>("disp_size");
+    const int start_number = parser.get<int>("start_number");
+
+    if (!parser.check()) {
+        parser.printErrors();
+        parser.printMessage();
+        std::exit(EXIT_FAILURE);
+    }
+
+    cv::Mat I1, I2;
+
+    ImagePreprocessor preprocessor; // Create an instance of the ImagePreprocessor class
+
+    for (int frame_no = start_number;; frame_no++) {
+        I1 = cv::imread(cv::format(image_format_L.c_str(), frame_no), cv::IMREAD_GRAYSCALE);
+        I2 = cv::imread(cv::format(image_format_R.c_str(), frame_no), cv::IMREAD_GRAYSCALE);
+        
+        // Check if images are empty, if so break the loop
+        if (I1.empty() || I2.empty()) {
+            std::cout << "No more images to process or image pair not found." << std::endl;
+            break;
+        }
+
+        // Preprocess the images
+        preprocessor.preprocess_image_pair(I1, I2);
+
+        const int width = I1.cols;
+        const int height = I1.rows;
+
+        // Buffer sizes in bytes; the output uses 16 bits per pixel only for disp_size >= 256.
+        const int src_depth = I1.type() == CV_8U ? 8 : 16;
+        const int dst_depth = disp_size < 256 ? 8 : 16;
+        const int src_bytes = src_depth * width * height / 8;
+        const int dst_bytes = dst_depth * width * height / 8;
+
+        // The matcher is rebuilt for every frame because the image size may differ between pairs.
+        sgm::StereoSGM sgm(width, height, disp_size, src_depth, dst_depth, sgm::EXECUTE_INOUT_CUDA2CUDA);
+
+        device_buffer d_I1(src_bytes), d_I2(src_bytes), d_disparity(dst_bytes);
+        cv::Mat disparity(height, width, dst_depth == 8 ? CV_8S : CV_16S), disparity_color;
+
+        const int invalid_disp = sgm.get_invalid_disparity();
+
+        // NOTE(review): upload() copies src_bytes contiguous bytes starting at Mat::data,
+        // which assumes I1/I2 are continuous -- confirm after the cropping step above.
+        d_I1.upload(I1.data);
+        d_I2.upload(I2.data);
+
+        const auto t1 = std::chrono::system_clock::now();
+
+        sgm.execute(d_I1.data, d_I2.data, d_disparity.data);
+        // Wait for the GPU to finish so the measured interval covers the whole computation.
+        cudaDeviceSynchronize();
+
+        const auto t2 = std::chrono::system_clock::now();
+        const auto duration = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
+        const double fps = 1e6 / duration;
+
+        d_disparity.download(disparity.data);
+        cv::imwrite(cv::format("disparity_output_%04d.png", frame_no), disparity);
+
+        // Save disparity map as text file with pixel values
+        //std::ofstream disparity_file(cv::format("disparity_output_%04d.txt", frame_no));
+        //if (disparity_file.is_open()) {
+         //   for (int y = 0; y < disparity.rows; ++y) {
+          //      for (int x = 0; x < disparity.cols; ++x) {
+           //         disparity_file << disparity.at<short>(y, x) << " "; // Assuming disparity is CV_16S
+           //     }
+            //    disparity_file << std::endl;
+          //  }
+           // disparity_file.close();
+        //} else {
+          //  std::cerr << "Error: Could not open text file for disparity output." << std::endl;
+        //}
+        
+
+        // Print the size of the disparity map in MB
+        double disparity_size_mb = static_cast<double>(dst_bytes) / (1024 * 1024);
+        std::cout << "Size of disparity map: " << disparity_size_mb << " MB" << std::endl;
+
+        // Draw results
+        if (I1.type() != CV_8U)
+            cv::normalize(I1, I1, 0, 255, cv::NORM_MINMAX, CV_8U);
+
+        // Pixels equal to the invalid-disparity marker are passed as the mask and painted black.
+        colorize_disparity(disparity, disparity_color, disp_size, disparity == invalid_disp);
+        cv::putText(disparity_color, cv::format("sgm execution time: %4.1f[msec] %4.1f[FPS]",
+            1e-3 * duration, fps), cv::Point(50, 50), 2, 0.75, cv::Scalar(255, 255, 255));
+
+        cv::imshow("left image", I1);
+        cv::imshow("disparity", disparity_color);
+
+        cv::waitKey(0); // Hold the window open for inspection; press any key to continue
+    }
+
+    return 0;
+}
@@ -0,0 +1,29 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include "sample_common.h"
+
+#include <opencv2/imgproc.hpp>
+
+// Render a disparity image as a colour picture.
+// src is scaled by 255 / disp_size into 8 bits, mapped through the TURBO colour
+// map into dst, and every pixel selected by a non-empty mask is set to 0.
+void colorize_disparity(const cv::Mat& src, cv::Mat& dst, int disp_size, cv::InputArray mask)
+{
+	const double scale = 255. / disp_size;
+
+	cv::Mat scaled;
+	src.convertTo(scaled, CV_8U, scale);
+	cv::applyColorMap(scaled, dst, cv::COLORMAP_TURBO);
+
+	if (mask.empty())
+		return;
+
+	dst.setTo(0, mask);
+}
@@ -0,0 +1,45 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#ifndef __SAMPLE_COMMON_H__
+#define __SAMPLE_COMMON_H__
+
+#include <opencv2/core.hpp>
+#include <cuda_runtime.h>
+
+#define ASSERT_MSG(expr, msg) \
+if (!(expr)) { \
+	std::cerr << msg << std::endl; \
+	std::exit(EXIT_FAILURE); \
+} \
+
+// Owning handle for a block of CUDA device memory.
+// The memory is released in the destructor, so the object must have a single
+// owner: copying is disabled because two copies would free the same pointer.
+struct device_buffer
+{
+	device_buffer() : data(nullptr), size(0) {}
+	device_buffer(size_t count) : device_buffer() { allocate(count); }
+	~device_buffer() { cudaFree(data); }
+
+	device_buffer(const device_buffer&) = delete;
+	device_buffer& operator=(const device_buffer&) = delete;
+
+	// Allocate count bytes on the device, releasing any previous allocation first.
+	// On failure the buffer is left empty (data == nullptr, size == 0).
+	void allocate(size_t count)
+	{
+		cudaFree(data); // cudaFree(nullptr) is a no-op
+		data = nullptr;
+		size = 0;
+		if (cudaMalloc(&data, count) == cudaSuccess)
+			size = count;
+		else
+			data = nullptr;
+	}
+
+	// Copy size bytes from host memory h_data into the device buffer.
+	void upload(const void* h_data) { cudaMemcpy(data, h_data, size, cudaMemcpyHostToDevice); }
+	// Copy size bytes from the device buffer into host memory h_data.
+	void download(void* h_data) { cudaMemcpy(h_data, data, size, cudaMemcpyDeviceToHost); }
+
+	void* data;  // device pointer, nullptr while nothing is allocated
+	size_t size; // size of the allocation in bytes
+};
+
+void colorize_disparity(const cv::Mat& src, cv::Mat& dst, int disp_size, cv::InputArray mask = cv::noArray());
+
+#endif // !__SAMPLE_COMMON_H__
@@ -0,0 +1,140 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include <iostream>
+#include <iomanip>
+#include <chrono>
+
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/highgui.hpp>
+
+#include <libsgm.h>
+
+#include "sample_common.h"
+
+// Option table handed to cv::CommandLineParser in main().
+// Each entry reads "{ name | default value | help text }"; names starting
+// with '@' are positional arguments.
+static const std::string keys =
+"{ @left_img   | <none> | path to input left image                                       }"
+"{ @right_img  | <none> | path to input right image                                      }"
+"{ disp_size   |    128 | maximum possible disparity value                               }"
+"{ out_depth   |      8 | disparity image's bits per pixel                               }"
+"{ subpixel    |        | enable subpixel estimation                                     }"
+"{ num_paths   |      8 | number of scanlines used in cost aggregation                   }"
+"{ census_type |      1 | type of census transform (0:CENSUS_9x7 1:SYMMETRIC_CENSUS_9x7) }"
+"{ iterations  |    100 | number of iterations for measuring performance                 }"
+"{ help h      |        | display this help and exit                                     }";
+
+/**
+ * Benchmark entry point: runs stereo matching on one image pair `iterations`
+ * times (after one untimed warm-up run), prints the mean processing time and
+ * FPS, and writes the colourised result to disparity.png.
+ * Terminates with EXIT_FAILURE on invalid options or unreadable images.
+ */
+int main(int argc, char* argv[])
+{
+	cv::CommandLineParser parser(argc, argv, keys);
+	if (parser.has("help")) {
+		parser.printMessage();
+		return 0;
+	}
+
+	cv::Mat I1 = cv::imread(parser.get<cv::String>("@left_img"), cv::IMREAD_UNCHANGED);
+	cv::Mat I2 = cv::imread(parser.get<cv::String>("@right_img"), cv::IMREAD_UNCHANGED);
+
+	const int disp_size = parser.get<int>("disp_size");
+	const int dst_depth = parser.get<int>("out_depth");
+	const bool subpixel = parser.has("subpixel");
+	const int num_paths = parser.get<int>("num_paths");
+	const auto census_type = static_cast<sgm::CensusType>(parser.get<int>("census_type"));
+	const int iterations = parser.get<int>("iterations");
+
+	if (!parser.check()) {
+		parser.printErrors();
+		parser.printMessage();
+		std::exit(EXIT_FAILURE);
+	}
+
+	ASSERT_MSG(!I1.empty() && !I2.empty(), "imread failed.");
+	ASSERT_MSG(I1.size() == I2.size() && I1.type() == I2.type(), "input images must be same size and type.");
+	ASSERT_MSG(I1.type() == CV_8U || I1.type() == CV_16U, "input image format must be CV_8U or CV_16U.");
+	ASSERT_MSG(disp_size == 64 || disp_size == 128 || disp_size == 256, "disparity size must be 64, 128 or 256.");
+	ASSERT_MSG(num_paths == 4 || num_paths == 8, "number of scanlines must be 4 or 8.");
+	ASSERT_MSG(census_type == sgm::CensusType::CENSUS_9x7 || census_type == sgm::CensusType::SYMMETRIC_CENSUS_9x7, "census type must be 0 or 1.");
+	ASSERT_MSG(dst_depth == 8 || dst_depth == 16, "output depth bits must be 8 or 16");
+	// The mean below divides by iterations, so zero or negative counts are rejected up front.
+	ASSERT_MSG(iterations > 0, "number of iterations must be positive.");
+	if (subpixel) {
+		ASSERT_MSG(dst_depth == 16, "output depth bits must be 16 if subpixel option is enabled.");
+	}
+
+	const int width = I1.cols;
+	const int height = I1.rows;
+
+	const int src_depth = I1.type() == CV_8U ? 8 : 16;
+	const int src_bytes = src_depth * width * height / 8;
+	const int dst_bytes = dst_depth * width * height / 8;
+	const sgm::PathType path_type = num_paths == 8 ? sgm::PathType::SCAN_8PATH : sgm::PathType::SCAN_4PATH;
+
+	const sgm::StereoSGM::Parameters param(10, 120, 0.95f, subpixel, path_type, 0, 1, census_type);
+	sgm::StereoSGM sgm(width, height, disp_size, src_depth, dst_depth, sgm::EXECUTE_INOUT_CUDA2CUDA, param);
+
+	device_buffer d_I1(src_bytes), d_I2(src_bytes), d_disparity(dst_bytes);
+	cv::Mat disparity(height, width, dst_depth == 8 ? CV_8S : CV_16S);
+
+	d_I1.upload(I1.data);
+	d_I2.upload(I2.data);
+
+	cudaDeviceProp prop;
+	int version;
+	cudaGetDeviceProperties(&prop, 0);
+	cudaRuntimeGetVersion(&version);
+
+	// show settings
+	std::cout << "# Settings" << std::endl;
+	std::cout << "device name         : " << prop.name << std::endl;
+	std::cout << "CUDA runtime version: " << version << std::endl;
+	std::cout << "image size          : " << I1.size() << std::endl;
+	std::cout << "disparity size      : " << disp_size << std::endl;
+	std::cout << "output depth        : " << dst_depth << std::endl;
+	std::cout << "subpixel option     : " << (subpixel ? "true" : "false") << std::endl;
+	std::cout << "sgm path            : " << num_paths << " path" << std::endl;
+	std::cout << "census type         : " << (census_type == sgm::CensusType::CENSUS_9x7 ? "CENSUS_9x7" : "SYMMETRIC_CENSUS_9x7") << std::endl;
+	std::cout << "iterations          : " << iterations << std::endl;
+	std::cout << std::endl;
+
+	// run benchmark; iteration 0 is a warm-up whose time is not accumulated
+	std::cout << "Running benchmark..." << std::endl;
+	uint64_t sum = 0;
+	for (int i = 0; i <= iterations; i++) {
+		// steady_clock is monotonic, so wall-clock adjustments cannot distort the interval
+		const auto t1 = std::chrono::steady_clock::now();
+
+		sgm.execute(d_I1.data, d_I2.data, d_disparity.data);
+		// wait for the GPU so the interval covers the whole computation
+		cudaDeviceSynchronize();
+
+		const auto t2 = std::chrono::steady_clock::now();
+		if (i > 0)
+			sum += std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
+	}
+	std::cout << "Done." << std::endl << std::endl;
+
+	// show results
+	const double time_millisec = 1e-3 * sum / iterations;
+	const double fps = 1e3 / time_millisec;
+	std::cout << "# Results" << std::endl;
+	std::cout.setf(std::ios::fixed);
+	std::cout << std::setprecision(1) << "Processing Time[Milliseconds]: " << time_millisec << std::endl;
+	std::cout << std::setprecision(1) << "FPS                          : " << fps << std::endl;
+	std::cout << std::endl;
+
+	// save disparity image; with subpixel enabled the stored values are scaled by SUBPIXEL_SCALE
+	const int disp_scale = subpixel ? sgm::StereoSGM::SUBPIXEL_SCALE : 1;
+	d_disparity.download(disparity.data);
+	colorize_disparity(disparity, disparity, disp_scale * disp_size, disparity == sgm.get_invalid_disparity());
+	cv::imwrite("disparity.png", disparity);
+
+	return 0;
+}
@@ -0,0 +1,118 @@
+#include <algorithm>
+#include <chrono>
+#include <iostream>
+#include <string>
+#include <thread>
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/highgui.hpp>
+#include <opencv2/core/utils/filesystem.hpp>
+#include <libsgm.h>
+#include "sample_common.h"
+
+// Option table handed to cv::CommandLineParser in main(); every entry is written as
+// "{ name | default value | help text }". Entries starting with '@' are positional.
+static const std::string keys =
+"{ @left-image-format  | <none> | format string for path to input left image  }"
+"{ @right-image-format | <none> | format string for path to input right image }"
+"{ disp_size           |    256 | maximum possible disparity value            }"
+"{ P1                  |     10 | penalty on the disparity change by plus or minus 1 }"
+"{ P2                  |    120 | penalty on the disparity change by more than 1 }"
+"{ uniqueness          |   0.80 | margin in ratio for best cost function value }"
+"{ num_paths           |      8 | number of scanlines used in cost aggregation }"
+"{ min_disp            |   -160 | minimum disparity value                     }"
+"{ LR_max_diff         |      1 | max allowed difference between L/R disparity }"
+"{ census_type         |      1 | type of census transform                    }"
+"{ interval            |      1 | polling interval in seconds                 }"
+"{ help h              |        | display this help and exit                  }";
+
+// Brings a stereo pair into the form the matcher in this file is fed with:
+// single-channel images of identical size, stored without row padding.
+class ImagePreprocessor {
+public:
+    // Converts multi-channel inputs to grayscale in place, then crops both images to
+    // their common top-left region (minimum width and minimum height of the pair).
+    //
+    // img_left / img_right: modified in place; on return both have the same size.
+    void preprocess_image_pair(cv::Mat& img_left, cv::Mat& img_right) {
+        if (img_left.channels() > 1) cv::cvtColor(img_left, img_left, cv::COLOR_BGR2GRAY);
+        if (img_right.channels() > 1) cv::cvtColor(img_right, img_right, cv::COLOR_BGR2GRAY);
+
+        const int min_height = std::min(img_left.rows, img_right.rows);
+        const int min_width = std::min(img_left.cols, img_right.cols);
+        const cv::Rect common(0, 0, min_width, min_height);
+
+        crop_to(img_left, common);
+        crop_to(img_right, common);
+    }
+
+private:
+    // Crops img to roi unless it already has that size. A cv::Mat ROI is only a view
+    // that keeps the row stride of the wider parent image, while the caller passes the
+    // raw data pointer plus cols/rows to the matcher, so the crop is deep-copied to
+    // obtain tightly packed rows.
+    static void crop_to(cv::Mat& img, const cv::Rect& roi) {
+        if (img.cols == roi.width && img.rows == roi.height) return;
+        img = img(roi).clone();
+    }
+};
+
+// Reports whether the result file for the given frame index is already on disk.
+// Result files are looked up as output/disparity_NNNN.xml relative to the working directory.
+bool disparityAlreadyProcessed(int frame_no) {
+    return cv::utils::fs::exists(cv::format("output/disparity_%04d.xml", frame_no));
+}
+
+// Batch entry point. Walks frame indices upward from 0; for every index whose left and
+// right input files exist and whose result file is not on disk yet, it computes the SGM
+// disparity and stores it as output/disparity_NNNN.xml. The loop only ends on an error,
+// in which case the process exits with a non-zero status.
+int main(int argc, char* argv[]) {
+    cv::CommandLineParser parser(argc, argv, keys);
+    if (parser.has("help")) {
+        parser.printMessage();
+        return 0;
+    }
+
+    const std::string format_L = parser.get<cv::String>("@left-image-format");
+    const std::string format_R = parser.get<cv::String>("@right-image-format");
+    const int disp_size = parser.get<int>("disp_size");
+    const int P1 = parser.get<int>("P1");
+    const int P2 = parser.get<int>("P2");
+    const float uniqueness = parser.get<float>("uniqueness");
+    const int num_paths = parser.get<int>("num_paths");
+    const int min_disp = parser.get<int>("min_disp");
+    const int LR_max_diff = parser.get<int>("LR_max_diff");
+    const int interval = parser.get<int>("interval");
+    const auto census_type = static_cast<sgm::CensusType>(parser.get<int>("census_type"));
+
+    if (!parser.check()) {
+        parser.printErrors();
+        parser.printMessage();
+        std::exit(EXIT_FAILURE);
+    }
+
+    // Reject unsupported settings once, before the loop starts, instead of silently
+    // falling back (e.g. any num_paths other than 8 used to select 4-path scanning).
+    ASSERT_MSG(disp_size == 64 || disp_size == 128 || disp_size == 256, "disparity size must be 64, 128 or 256.");
+    ASSERT_MSG(num_paths == 4 || num_paths == 8, "number of scanlines must be 4 or 8.");
+    ASSERT_MSG(census_type == sgm::CensusType::CENSUS_9x7 || census_type == sgm::CensusType::SYMMETRIC_CENSUS_9x7, "census type must be 0 or 1.");
+
+    if (!cv::utils::fs::exists("output") && !cv::utils::fs::createDirectory("output")) {
+        std::cerr << "Failed to create output directory." << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    ImagePreprocessor preprocessor;
+    const sgm::PathType path_type = num_paths == 8 ? sgm::PathType::SCAN_8PATH : sgm::PathType::SCAN_4PATH;
+    const sgm::StereoSGM::Parameters param(P1, P2, uniqueness, false, path_type, min_disp, LR_max_diff, census_type);
+
+    int exit_code = 0;
+    int last_checked = 0;
+    while (true) {
+        const std::string left_path = cv::format(format_L.c_str(), last_checked);
+        const std::string right_path = cv::format(format_R.c_str(), last_checked);
+
+        if (cv::utils::fs::exists(left_path) && cv::utils::fs::exists(right_path) && !disparityAlreadyProcessed(last_checked)) {
+            cv::TickMeter timer;
+            timer.start();
+            std::cout << "Processing frame " << last_checked;
+
+            cv::Mat I1 = cv::imread(left_path, cv::IMREAD_UNCHANGED);
+            cv::Mat I2 = cv::imread(right_path, cv::IMREAD_UNCHANGED);
+
+            if (I1.empty() || I2.empty()) {
+                std::cerr << "Error reading images." << std::endl;
+                exit_code = EXIT_FAILURE;  // let callers/scripts see that the run was aborted
+                break;
+            }
+
+            preprocessor.preprocess_image_pair(I1, I2);
+
+            ASSERT_MSG(I1.size() == I2.size() && I1.type() == I2.type(), "Mismatched image size/type.");
+            ASSERT_MSG(I1.type() == CV_8U || I1.type() == CV_16U, "Images must be CV_8U or CV_16U.");
+
+            const int src_depth = I1.type() == CV_8U ? 8 : 16;
+            const int dst_depth = 16;
+
+            sgm::StereoSGM ssgm(I1.cols, I1.rows, disp_size, src_depth, dst_depth, sgm::EXECUTE_INOUT_HOST2HOST, param);
+            cv::Mat disparity(I1.size(), CV_16S);
+            ssgm.execute(I1.data, I2.data, disparity.data);
+
+            cv::FileStorage fs(cv::format("output/disparity_%04d.xml", last_checked), cv::FileStorage::WRITE);
+            if (!fs.isOpened()) {
+                std::cerr << "Error writing disparity file." << std::endl;
+                exit_code = EXIT_FAILURE;
+                break;
+            }
+            fs << "disparity" << disparity;
+            fs.release();
+
+            timer.stop();
+            std::cout << " - " << timer.getTimeSec() << " seconds" << std::endl;
+        }
+
+        // NOTE(review): the index advances even when the inputs for it were not found,
+        // so a frame that shows up late is never revisited - confirm this is intended.
+        last_checked++;
+
+        // Sleep for the polling interval. cv::waitKey is not used here because it needs
+        // an active HighGUI window to honour its delay and this program never opens one.
+        std::this_thread::sleep_for(std::chrono::seconds(interval));
+    }
+
+    return exit_code;
+}
+
@@ -0,0 +1,120 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include <iostream>
+
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/highgui.hpp>
+
+#include <libsgm.h>
+
+#include "sample_common.h"
+
+// Option table handed to cv::CommandLineParser in main(); every entry is written as
+// "{ name | default value | help text }". Entries starting with '@' are positional.
+static const std::string keys =
+"{ @left_img   | <none> | path to input left image                                                            }"
+"{ @right_img  | <none> | path to input right image                                                           }"
+"{ disp_size   |     64 | maximum possible disparity value                                                    }"
+"{ P1          |     10 | penalty on the disparity change by plus or minus 1 between neighbor pixels          }"
+"{ P2          |    120 | penalty on the disparity change by more than 1 between neighbor pixels              }"
+"{ uniqueness  |   0.95 | margin in ratio by which the best cost function value should be at least second one }"
+"{ num_paths   |      8 | number of scanlines used in cost aggregation                                        }"
+"{ min_disp    |      0 | minimum disparity value                                                             }"
+"{ LR_max_diff |      1 | maximum allowed difference between left and right disparity                         }"
+"{ census_type |      1 | type of census transform (0:CENSUS_9x7 1:SYMMETRIC_CENSUS_9x7)                      }"
+"{ help h      |        | display this help and exit                                                          }";
+
+// Computes the disparity of one stereo pair through sgm::LibSGMWrapper on GpuMat
+// inputs and shows the result in an interactive window ('s' cycles the view, ESC quits).
+int main(int argc, char* argv[])
+{
+	cv::CommandLineParser parser(argc, argv, keys);
+	if (parser.has("help")) {
+		parser.printMessage();
+		return 0;
+	}
+
+	// input pair, loaded exactly as stored on disk
+	cv::Mat I1 = cv::imread(parser.get<cv::String>("@left_img"), cv::IMREAD_UNCHANGED);
+	cv::Mat I2 = cv::imread(parser.get<cv::String>("@right_img"), cv::IMREAD_UNCHANGED);
+
+	// matcher settings
+	const int disp_size = parser.get<int>("disp_size");
+	const int P1 = parser.get<int>("P1");
+	const int P2 = parser.get<int>("P2");
+	const float uniqueness = parser.get<float>("uniqueness");
+	const int num_paths = parser.get<int>("num_paths");
+	const int min_disp = parser.get<int>("min_disp");
+	const int LR_max_diff = parser.get<int>("LR_max_diff");
+	const auto census_type = static_cast<sgm::CensusType>(parser.get<int>("census_type"));
+
+	if (!parser.check()) {
+		parser.printErrors();
+		parser.printMessage();
+		std::exit(EXIT_FAILURE);
+	}
+
+	ASSERT_MSG(!I1.empty() && !I2.empty(), "imread failed.");
+	ASSERT_MSG(I1.size() == I2.size() && I1.type() == I2.type(), "input images must be same size and type.");
+	ASSERT_MSG(I1.type() == CV_8U || I1.type() == CV_16U, "input image format must be CV_8U or CV_16U.");
+	ASSERT_MSG(disp_size == 64 || disp_size == 128 || disp_size == 256, "disparity size must be 64, 128 or 256.");
+	ASSERT_MSG(num_paths == 4 || num_paths == 8, "number of scanlines must be 4 or 8.");
+	ASSERT_MSG(census_type == sgm::CensusType::CENSUS_9x7 || census_type == sgm::CensusType::SYMMETRIC_CENSUS_9x7, "census type must be 0 or 1.");
+
+	sgm::PathType path_type = sgm::PathType::SCAN_4PATH;
+	if (num_paths == 8)
+		path_type = sgm::PathType::SCAN_8PATH;
+
+	sgm::LibSGMWrapper sgm(disp_size, P1, P2, uniqueness, false, path_type, min_disp, LR_max_diff, census_type);
+	cv::Mat disparity;
+
+	// run the matcher on the GPU; OpenCV reports a missing CUDA build via an exception
+	try {
+		cv::cuda::GpuMat d_I1(I1);
+		cv::cuda::GpuMat d_I2(I2);
+		cv::cuda::GpuMat d_disparity;
+		sgm.execute(d_I1, d_I2, d_disparity);
+		d_disparity.download(disparity);
+	}
+	catch (const cv::Exception& e) {
+		std::cerr << e.what() << std::endl;
+		if (e.code == cv::Error::GpuNotSupported)
+			return 1;
+		return -1;
+	}
+
+	// pixels the matcher flagged as invalid
+	const cv::Mat invalid_mask = disparity == sgm.getInvalidDisparity();
+
+	// build the three views: gray disparity, colored disparity, 8-bit input
+	cv::Mat disparity_8u, disparity_color;
+	disparity.convertTo(disparity_8u, CV_8U, 255. / disp_size);
+	cv::applyColorMap(disparity_8u, disparity_color, cv::COLORMAP_TURBO);
+	disparity_8u.setTo(0, invalid_mask);
+	disparity_color.setTo(cv::Scalar::all(0), invalid_mask);
+	if (I1.type() != CV_8U)
+		cv::normalize(I1, I1, 0, 255, cv::NORM_MINMAX, CV_8U);
+
+	const std::vector<cv::Mat> images = { disparity_8u, disparity_color, I1 };
+	const std::vector<std::string> titles = { "disparity", "disparity color", "input" };
+
+	std::cout << "Hot keys:" << std::endl;
+	std::cout << "\tESC - quit the program" << std::endl;
+	std::cout << "\ts - switch display (disparity | colored disparity | input image)" << std::endl;
+
+	for (int mode = 0;;) {
+		cv::setWindowTitle("image", titles[mode]);
+		cv::imshow("image", images[mode]);
+
+		// the key code is deliberately narrowed to char before it is compared
+		const char key = static_cast<char>(cv::waitKey(0));
+		if (key == 27)
+			break;
+		if (key == 's')
+			mode = (mode + 1) % 3;
+	}
+
+	return 0;
+}
@@ -0,0 +1,121 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include <iostream>
+#include <chrono>
+
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/highgui.hpp>
+
+#include <libsgm.h>
+
+#include "sample_common.h"
+
+// Option table handed to cv::CommandLineParser in main(); every entry is written as
+// "{ name | default value | help text }". Entries starting with '@' are positional.
+static const std::string keys =
+"{ @left-image-format  | <none> | format string for path to input left image  }"
+"{ @right-image-format | <none> | format string for path to input right image }"
+"{ disp_size           |    128 | maximum possible disparity value            }"
+"{ start_number        |      0 | index to start reading                      }"
+"{ help h              |        | display this help and exit                  }";
+
+// Plays an image sequence in an endless loop: every frame pair is uploaded to the GPU,
+// matched with sgm::StereoSGM, and shown together with the measured execution time.
+// When a frame cannot be read the sequence restarts at start_number; ESC quits.
+int main(int argc, char* argv[])
+{
+	cv::CommandLineParser parser(argc, argv, keys);
+	if (parser.has("help")) {
+		parser.printMessage();
+		return 0;
+	}
+
+	const std::string image_format_L = parser.get<cv::String>("@left-image-format");
+	const std::string image_format_R = parser.get<cv::String>("@right-image-format");
+	const int disp_size = parser.get<int>("disp_size");
+	const int start_number = parser.get<int>("start_number");
+
+	if (!parser.check()) {
+		parser.printErrors();
+		parser.printMessage();
+		std::exit(EXIT_FAILURE);
+	}
+
+	// the first frame pair fixes the image geometry for the whole run
+	cv::Mat I1 = cv::imread(cv::format(image_format_L.c_str(), start_number), cv::IMREAD_UNCHANGED);
+	cv::Mat I2 = cv::imread(cv::format(image_format_R.c_str(), start_number), cv::IMREAD_UNCHANGED);
+
+	ASSERT_MSG(!I1.empty() && !I2.empty(), "imread failed.");
+	if (I1.channels() > 1) cv::cvtColor(I1, I1, cv::COLOR_BGR2GRAY);
+	if (I2.channels() > 1) cv::cvtColor(I2, I2, cv::COLOR_BGR2GRAY);
+	ASSERT_MSG(I1.size() == I2.size() && I1.type() == I2.type(), "input images must be same size and type.");
+	ASSERT_MSG(I1.type() == CV_8U || I1.type() == CV_16U, "input image format must be CV_8U or CV_16U.");
+	ASSERT_MSG(disp_size == 64 || disp_size == 128 || disp_size == 256, "disparity size must be 64, 128 or 256.");
+
+	const int width = I1.cols;
+	const int height = I1.rows;
+	const int src_type = I1.type();
+
+	const int src_depth = I1.type() == CV_8U ? 8 : 16;
+	const int dst_depth = disp_size < 256 ? 8 : 16;
+	const int src_bytes = src_depth * width * height / 8;
+	const int dst_bytes = dst_depth * width * height / 8;
+
+	sgm::StereoSGM sgm(width, height, disp_size, src_depth, dst_depth, sgm::EXECUTE_INOUT_CUDA2CUDA);
+
+	device_buffer d_I1(src_bytes), d_I2(src_bytes), d_disparity(dst_bytes);
+	cv::Mat disparity(height, width, dst_depth == 8 ? CV_8S : CV_16S), disparity_color;
+
+	const int invalid_disp = sgm.get_invalid_disparity();
+
+	for (int frame_no = start_number;; frame_no++) {
+
+		I1 = cv::imread(cv::format(image_format_L.c_str(), frame_no), cv::IMREAD_UNCHANGED);
+		I2 = cv::imread(cv::format(image_format_R.c_str(), frame_no), cv::IMREAD_UNCHANGED);
+		if (I1.empty() || I2.empty()) {
+			// end of sequence (or unreadable frame): start over from the first index
+			frame_no = start_number - 1;
+			continue;
+		}
+		if (I1.channels() > 1) cv::cvtColor(I1, I1, cv::COLOR_BGR2GRAY);
+		if (I2.channels() > 1) cv::cvtColor(I2, I2, cv::COLOR_BGR2GRAY);
+
+		// The device buffers and the matcher were sized from the first frame, and upload()
+		// only receives a raw pointer, so a frame of another size or type must be rejected
+		// here before it is read past its end.
+		ASSERT_MSG(I1.cols == width && I1.rows == height && I2.size() == I1.size(), "all frames must have the size of the first frame.");
+		ASSERT_MSG(I1.type() == src_type && I2.type() == src_type, "all frames must have the type of the first frame.");
+
+		d_I1.upload(I1.data);
+		d_I2.upload(I2.data);
+
+		// steady_clock is monotonic, so the measurement is not disturbed by wall-clock adjustments
+		const auto t1 = std::chrono::steady_clock::now();
+
+		sgm.execute(d_I1.data, d_I2.data, d_disparity.data);
+		cudaDeviceSynchronize();
+
+		const auto t2 = std::chrono::steady_clock::now();
+		const auto duration = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
+		const double fps = 1e6 / duration;
+
+		d_disparity.download(disparity.data);
+
+		// draw results
+		if (I1.type() != CV_8U)
+			cv::normalize(I1, I1, 0, 255, cv::NORM_MINMAX, CV_8U);
+
+		colorize_disparity(disparity, disparity_color, disp_size, disparity == invalid_disp);
+		cv::putText(disparity_color, cv::format("sgm execution time: %4.1f[msec] %4.1f[FPS]",
+			1e-3 * duration, fps), cv::Point(50, 50), 2, 0.75, cv::Scalar(255, 255, 255));
+
+		cv::imshow("left image", I1);
+		cv::imshow("disparity", disparity_color);
+
+		const char c = cv::waitKey(1);
+		if (c == 27) // ESC
+			break;
+	}
+
+	return 0;
+}
@@ -0,0 +1,124 @@
+#include <iostream>
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/highgui.hpp>
+#include <libsgm.h>
+#include "sample_common.h"
+
+// Option table handed to cv::CommandLineParser in main(); every entry is written as
+// "{ name | default value | help text }". Entries starting with '@' are positional.
+static const std::string keys =
+"{ @left_img   | <none> | path to input left image                                                            }"
+"{ @right_img  | <none> | path to input right image                                                           }"
+"{ disp_size   |    256 | maximum possible disparity value                                                    }"
+"{ P1          |     10 | penalty on the disparity change by plus or minus 1 between neighbor pixels          }"
+"{ P2          |    120 | penalty on the disparity change by more than 1 between neighbor pixels              }"
+"{ uniqueness  |   0.80 | margin in ratio by which the best cost function value should be at least second one }"
+"{ num_paths   |      8 | number of scanlines used in cost aggregation                                        }"
+"{ min_disp    |   -160 | minimum disparity value                                                             }"
+"{ LR_max_diff |      1 | maximum allowed difference between left and right disparity                         }"
+"{ census_type |      1 | type of census transform (0:CENSUS_9x7 1:SYMMETRIC_CENSUS_9x7)                      }"
+"{ output_dir  |      . | directory to save disparity.xml and disparity_color.png                             }"
+"{ no_display  |      0 | set to 1 to skip interactive display window (for pipeline/headless use)             }"
+"{ help h      |        | display this help and exit                                                          }";
+
+// Computes the disparity of one stereo pair on the host-to-host path of sgm::StereoSGM,
+// writes <output_dir>/disparity.xml and <output_dir>/disparity_color.png, prints timing
+// for loading, matching and the whole run, and optionally shows an interactive viewer.
+// Returns a non-zero status when one of the output files cannot be written.
+int main(int argc, char* argv[])
+{
+    double start_time = cv::getTickCount();  // Start total execution time
+
+    cv::CommandLineParser parser(argc, argv, keys);
+    if (parser.has("help")) {
+        parser.printMessage();
+        return 0;
+    }
+
+    double load_start = cv::getTickCount(); // Start loading time
+    cv::Mat I1 = cv::imread(parser.get<cv::String>("@left_img"), cv::IMREAD_UNCHANGED);
+    cv::Mat I2 = cv::imread(parser.get<cv::String>("@right_img"), cv::IMREAD_UNCHANGED);
+    double load_end = cv::getTickCount();
+    double load_time_s = (load_end - load_start) / cv::getTickFrequency();  // Seconds
+    double load_time_ms = load_time_s * 1000.0;  // Milliseconds
+    std::cout << "Image Loading Time: " << load_time_s << " s (" << load_time_ms << " ms)" << std::endl;
+
+    if (I1.channels() > 1) cv::cvtColor(I1, I1, cv::COLOR_BGR2GRAY);
+    if (I2.channels() > 1) cv::cvtColor(I2, I2, cv::COLOR_BGR2GRAY);
+
+    const int disp_size = parser.get<int>("disp_size");
+    const int P1 = parser.get<int>("P1");
+    const int P2 = parser.get<int>("P2");
+    const float uniqueness = parser.get<float>("uniqueness");
+    const int num_paths = parser.get<int>("num_paths");
+    const int min_disp = parser.get<int>("min_disp");
+    const int LR_max_diff = parser.get<int>("LR_max_diff");
+    const auto census_type = static_cast<sgm::CensusType>(parser.get<int>("census_type"));
+    // Read every option before parser.check() so that malformed values are reported.
+    const std::string output_dir = parser.get<std::string>("output_dir");
+    const bool no_display = parser.get<int>("no_display") != 0;
+
+    if (!parser.check()) {
+        parser.printErrors();
+        parser.printMessage();
+        std::exit(EXIT_FAILURE);
+    }
+
+    ASSERT_MSG(!I1.empty() && !I2.empty(), "imread failed.");
+    ASSERT_MSG(I1.size() == I2.size() && I1.type() == I2.type(), "input images must be same size and type.");
+    ASSERT_MSG(I1.type() == CV_8U || I1.type() == CV_16U, "input image format must be CV_8U or CV_16U.");
+    ASSERT_MSG(disp_size == 64 || disp_size == 128 || disp_size == 256, "disparity size must be 64, 128 or 256.");
+    ASSERT_MSG(num_paths == 4 || num_paths == 8, "number of scanlines must be 4 or 8.");
+    ASSERT_MSG(census_type == sgm::CensusType::CENSUS_9x7 || census_type == sgm::CensusType::SYMMETRIC_CENSUS_9x7, "census type must be 0 or 1.");
+
+    const int src_depth = I1.type() == CV_8U ? 8 : 16;
+    const int dst_depth = 16;
+    const sgm::PathType path_type = num_paths == 8 ? sgm::PathType::SCAN_8PATH : sgm::PathType::SCAN_4PATH;
+
+    const sgm::StereoSGM::Parameters param(P1, P2, uniqueness, false, path_type, min_disp, LR_max_diff, census_type);
+    sgm::StereoSGM ssgm(I1.cols, I1.rows, disp_size, src_depth, dst_depth, sgm::EXECUTE_INOUT_HOST2HOST, param);
+
+    cv::Mat disparity(I1.size(), CV_16S);
+
+    double disparity_start = cv::getTickCount(); // Start disparity computation time
+    ssgm.execute(I1.data, I2.data, disparity.data);
+    double disparity_end = cv::getTickCount();
+    double disparity_time_s = (disparity_end - disparity_start) / cv::getTickFrequency();  // Seconds
+    double disparity_time_ms = disparity_time_s * 1000.0;  // Milliseconds
+    std::cout << "Disparity Computation Time: " << disparity_time_s << " s (" << disparity_time_ms << " ms)" << std::endl;
+
+    // Save disparity; a missing or read-only output directory must not pass unnoticed
+    // because the exit status is what a calling pipeline sees.
+    cv::FileStorage fs(output_dir + "/disparity.xml", cv::FileStorage::WRITE);
+    if (!fs.isOpened()) {
+        std::cerr << "Failed to open " << output_dir << "/disparity.xml for writing." << std::endl;
+        return EXIT_FAILURE;
+    }
+    fs << "disparity" << disparity;
+    fs.release();
+
+    // Convert disparity to 8-bit for visualization. The values are shifted by min_disp
+    // first: with a negative min_disp a plain scale would saturate every negative
+    // disparity to 0 and flatten that part of the range.
+    cv::Mat disparity_8u, disparity_color;
+    const double vis_scale = 255.0 / disp_size;
+    disparity.convertTo(disparity_8u, CV_8U, vis_scale, -min_disp * vis_scale);
+    cv::applyColorMap(disparity_8u, disparity_color, cv::COLORMAP_TURBO);
+
+    // Paint pixels the matcher marked as invalid black in both views.
+    const cv::Mat invalid_mask = disparity == ssgm.get_invalid_disparity();
+    disparity_8u.setTo(0, invalid_mask);
+    disparity_color.setTo(cv::Scalar::all(0), invalid_mask);
+
+    // Save colored disparity image
+    if (!cv::imwrite(output_dir + "/disparity_color.png", disparity_color)) {
+        std::cerr << "Failed to write " << output_dir << "/disparity_color.png." << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    double total_end = cv::getTickCount();
+    double total_time_s = (total_end - start_time) / cv::getTickFrequency();  // Seconds
+    double total_time_ms = total_time_s * 1000.0;  // Milliseconds
+    std::cout << "Total Execution Time: " << total_time_s << " s (" << total_time_ms << " ms)" << std::endl;
+
+    if (!no_display) {
+        // Display images
+        const std::vector<cv::Mat> images = { disparity_8u, disparity_color, I1 };
+        const std::vector<std::string> titles = { "Disparity", "Colored Disparity", "Input Image" };
+
+        std::cout << "Hot keys:\n";
+        std::cout << "\tESC - Quit the program\n";
+        std::cout << "\ts - Switch display (Disparity | Colored Disparity | Input Image)\n";
+
+        int mode = 0;
+        while (true) {
+            cv::setWindowTitle("Image", titles[mode]);
+            cv::imshow("Image", images[mode]);
+
+            const char c = cv::waitKey(0);
+            if (c == 's') mode = (mode < 2 ? mode + 1 : 0);
+            if (c == 27) break;
+        }
+    }
+
+    return 0;
+}
+
@@ -0,0 +1,120 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include <iostream>
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/highgui.hpp>
+#include <libsgm.h>
+#include "sample_common.h"
+
+// Option table handed to cv::CommandLineParser in main(); every entry is written as
+// "{ name | default value | help text }". Entries starting with '@' are positional.
+static const std::string keys =
+"{ @left_img   | <none> | path to input left image                                                            }"
+"{ @right_img  | <none> | path to input right image                                                           }"
+"{ disp_size   |     64 | maximum possible disparity value                                                    }"
+"{ P1          |     10 | penalty on the disparity change by plus or minus 1 between neighbor pixels          }"
+"{ P2          |    120 | penalty on the disparity change by more than 1 between neighbor pixels              }"
+"{ uniqueness  |   0.95 | margin in ratio by which the best cost function value should be at least second one }"
+"{ num_paths   |      8 | number of scanlines used in cost aggregation                                        }"
+"{ min_disp    |      0 | minimum disparity value                                                             }"
+"{ LR_max_diff |      1 | maximum allowed difference between left and right disparity                         }"
+"{ census_type |      1 | type of census transform (0:CENSUS_9x7 1:SYMMETRIC_CENSUS_9x7)                      }"
+"{ help h      |        | display this help and exit                                                          }";
+
+// Computes the disparity of one stereo pair after converting it to grayscale and
+// cropping both images to a common size, saves disparity_map.png and
+// disparity_values.txt in the working directory, then opens an interactive viewer
+// ('s' cycles the view, ESC quits).
+int main(int argc, char* argv[]) {
+    cv::CommandLineParser parser(argc, argv, keys);
+    if (parser.has("help")) {
+        parser.printMessage();
+        return 0;
+    }
+
+    cv::Mat I1 = cv::imread(parser.get<cv::String>("@left_img"), cv::IMREAD_UNCHANGED);
+    cv::Mat I2 = cv::imread(parser.get<cv::String>("@right_img"), cv::IMREAD_UNCHANGED);
+
+    // Preprocessing: Convert images to grayscale if necessary
+    if (I1.channels() > 1) cv::cvtColor(I1, I1, cv::COLOR_BGR2GRAY);
+    if (I2.channels() > 1) cv::cvtColor(I2, I2, cv::COLOR_BGR2GRAY);
+
+    // Ensure images have the same size by cropping. The crop is deep-copied: a cv::Mat
+    // ROI keeps the row stride of the wider parent image, but the matcher below only
+    // receives the raw data pointer together with cols/rows.
+    if (!I1.empty() && !I2.empty()) {
+        const int new_width = std::min(I1.cols, I2.cols);
+        const int new_height = std::min(I1.rows, I2.rows);
+        const cv::Rect common(0, 0, new_width, new_height);
+        I1 = I1(common).clone();
+        I2 = I2(common).clone();
+    }
+
+    const int disp_size = parser.get<int>("disp_size");
+    const int P1 = parser.get<int>("P1");
+    const int P2 = parser.get<int>("P2");
+    const float uniqueness = parser.get<float>("uniqueness");
+    const int num_paths = parser.get<int>("num_paths");
+    const int min_disp = parser.get<int>("min_disp");
+    const int LR_max_diff = parser.get<int>("LR_max_diff");
+    const auto census_type = static_cast<sgm::CensusType>(parser.get<int>("census_type"));
+
+    if (!parser.check()) {
+        parser.printErrors();
+        parser.printMessage();
+        std::exit(EXIT_FAILURE);
+    }
+
+    ASSERT_MSG(!I1.empty() && !I2.empty(), "imread failed.");
+    ASSERT_MSG(I1.size() == I2.size(), "input images must be the same size.");
+    // Both buffers are handed over as 8-bit data, so both images have to be CV_8U.
+    ASSERT_MSG(I1.type() == CV_8U && I2.type() == CV_8U, "input image format must be CV_8U.");
+    ASSERT_MSG(disp_size == 64 || disp_size == 128 || disp_size == 256, "disparity size must be 64, 128 or 256.");
+    ASSERT_MSG(num_paths == 4 || num_paths == 8, "number of scanlines must be 4 or 8.");
+    ASSERT_MSG(census_type == sgm::CensusType::CENSUS_9x7 || census_type == sgm::CensusType::SYMMETRIC_CENSUS_9x7, "census type must be 0 or 1.");
+
+    // Honour the num_paths option instead of always scanning 8 paths.
+    const sgm::PathType path_type = num_paths == 8 ? sgm::PathType::SCAN_8PATH : sgm::PathType::SCAN_4PATH;
+    const sgm::StereoSGM::Parameters param(P1, P2, uniqueness, false, path_type, min_disp, LR_max_diff, census_type);
+    sgm::StereoSGM ssgm(I1.cols, I1.rows, disp_size, 8, 16, sgm::EXECUTE_INOUT_HOST2HOST, param);
+
+    cv::Mat disparity(I1.size(), CV_16S);
+    ssgm.execute(I1.data, I2.data, disparity.data);
+
+    // Convert disparity to 8-bit and apply colormap
+    cv::Mat disparity_8u, disparity_color;
+    disparity.convertTo(disparity_8u, CV_8U, 255. / disp_size);
+    cv::applyColorMap(disparity_8u, disparity_color, cv::COLORMAP_TURBO);
+
+    // Save disparity map
+    cv::imwrite("disparity_map.png", disparity_8u);
+
+    // Optionally save disparity values as a text file (one image row per line)
+    std::ofstream file("disparity_values.txt");
+    if (file.is_open()) {
+        for (int i = 0; i < disparity.rows; ++i) {
+            const int16_t* row = disparity.ptr<int16_t>(i);
+            for (int j = 0; j < disparity.cols; ++j) {
+                file << static_cast<int>(row[j]) << " ";
+            }
+            file << "\n";
+        }
+        file.close();
+    }
+    else {
+        // best effort: the dump is optional, but do not skip it silently
+        std::cerr << "Could not open disparity_values.txt for writing." << std::endl;
+    }
+
+    std::cout << "Hot keys:\n\tESC - quit the program\n\ts - switch display (disparity | colored disparity | input image)\n";
+
+    const std::vector<cv::Mat> images = { disparity_8u, disparity_color, I1 };
+    const std::vector<std::string> titles = { "disparity", "disparity color", "input" };
+
+    int mode = 0;
+    while (true) {
+        cv::setWindowTitle("image", titles[mode]);
+        cv::imshow("image", images[mode]);
+        char c = cv::waitKey(0);
+        if (c == 's') mode = (mode + 1) % 3;
+        if (c == 27) break;
+    }
+
+    return 0;
+}
+
@@ -0,0 +1,253 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include <iostream>
+#include <chrono>
+
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/highgui.hpp>
+
+#include <libsgm.h>
+
+#include "sample_common.h"
+
+// Camera Parameters
+// Plain aggregate describing the stereo camera: intrinsics (focal lengths and
+// principal point, in pixels), the stereo baseline, and the mounting pose.
+// CoordinateTransform below reads fu, fv, u0, v0, baseline and tilt from it.
+struct CameraParameters
+{
+	float fu;                 //!< focal length x (pixel)
+	float fv;                 //!< focal length y (pixel)
+	float u0;                 //!< principal point x (pixel)
+	float v0;                 //!< principal point y (pixel)
+	float baseline;           //!< baseline (meter)
+	float height;             //!< height position (meter), ignored when ROAD_ESTIMATION_AUTO
+	float tilt;               //!< tilt angle (radian), ignored when ROAD_ESTIMATION_AUTO
+};
+
+// Maps a pixel position plus its disparity to a 3D point. The values that do not depend
+// on the pixel (tilt sine/cosine, baseline * focal length, reciprocal focal lengths)
+// are computed once at construction.
+struct CoordinateTransform
+{
+	CoordinateTransform(const CameraParameters& camera)
+		: camera(camera)
+		, sinTilt(sinf(camera.tilt))
+		, cosTilt(cosf(camera.tilt))
+		, bf(camera.baseline * camera.fu)
+		, invfu(1.f / camera.fu)
+		, invfv(1.f / camera.fv)
+	{
+	}
+
+	// pt: pixel position, d: disparity at that pixel (used as a divisor, so callers
+	// must not pass 0). Returns the point after rotating the camera-frame
+	// coordinates by the tilt angle about the x axis.
+	inline cv::Point3f imageToWorld(const cv::Point2f& pt, float d) const
+	{
+		// camera-frame coordinates
+		const float depth = bf / d;
+		const float camX = invfu * (pt.x - camera.u0) * depth;
+		const float camY = invfv * (pt.y - camera.v0) * depth;
+
+		// tilt compensation: x is unchanged, y and z are rotated
+		return cv::Point3f(
+			camX,
+			camY * cosTilt + depth * sinTilt,
+			depth * cosTilt - camY * sinTilt);
+	}
+
+	CameraParameters camera;
+	float sinTilt, cosTilt, bf, invfu, invfv;
+};
+
+// Fills `points` with one 3D point per pixel of `disparity` whose value is greater
+// than zero; all other pixels are skipped. `disparity` must be CV_32F.
+// `subpixeled` is accepted but not read by this function.
+void reprojectPointsTo3D(const cv::Mat& disparity, const CameraParameters& camera, std::vector<cv::Point3f>& points, bool subpixeled)
+{
+	CV_Assert(disparity.type() == CV_32F);
+
+	const CoordinateTransform transform(camera);
+
+	points.clear();
+	points.reserve(disparity.rows * disparity.cols);
+
+	for (int row = 0; row < disparity.rows; row++)
+	{
+		const float* values = disparity.ptr<float>(row);
+		for (int col = 0; col < disparity.cols; col++)
+		{
+			const float d = values[col];
+			// written as a negated '>' so that NaN is skipped as well
+			if (!(d > 0))
+				continue;
+
+			const cv::Point2f pixel(static_cast<float>(col), static_cast<float>(row));
+			points.push_back(transform.imageToWorld(pixel, d));
+		}
+	}
+}
+
+// Turns `val` into a BGR color by evaluating an HSV-style ramp with saturation and
+// value fixed at 1; the hue is taken as 0.6 * (1 - val).
+static cv::Vec3b computeColor(float val)
+{
+	// per hue sector: which entry of the blend table feeds B, G and R
+	static const int sector_data[][3] = {
+		{ 1,3,0 }, { 1,0,2 }, { 3,0,1 }, { 0,2,1 }, { 0,1,3 }, { 2,1,0 }
+	};
+	const float saturation = 1.f;
+	const float value = 1.f;
+	const float hscale = 6.f;
+
+	// hue expressed in sectors and wrapped into [0, 6)
+	float hue = 0.6f * (1.f - val);
+	hue *= hscale;
+	if (hue < 0)
+	{
+		do { hue += 6; } while (hue < 0);
+	}
+	else if (hue >= 6)
+	{
+		do { hue -= 6; } while (hue >= 6);
+	}
+
+	// split into sector number and position inside the sector
+	int sector = cvFloor(hue);
+	float frac = hue - sector;
+	if (sector < 0 || sector >= 6)
+	{
+		// guards the table lookup against values that slipped through the wrap
+		sector = 0;
+		frac = 0.f;
+	}
+
+	const float blend[4] = {
+		value,
+		value * (1.f - saturation),
+		value * (1.f - saturation * frac),
+		value * (1.f - saturation * (1.f - frac))
+	};
+
+	const int* pick = sector_data[sector];
+	return cv::Vec3b(
+		(uchar)(255 * blend[pick[0]]),
+		(uchar)(255 * blend[pick[1]]),
+		(uchar)(255 * blend[pick[2]]));
+}
+
+// Renders the X/Z coordinates of `points` as a top view into `draw`, which is replaced
+// by a fresh 1024 x 512 (rows x cols) black CV_8UC3 image. Z values from 0 to 20
+// are spread over the image height and each point is colored by its Z value.
+void drawPoints3D(const std::vector<cv::Point3f>& points, cv::Mat& draw)
+{
+	const int SIZE_X = 512;
+	const int SIZE_Z = 1024;
+	const int maxz = 20; // [meter]
+	const double pixelsPerMeter = 1. * SIZE_Z / maxz;
+
+	draw = cv::Mat::zeros(SIZE_Z, SIZE_X, CV_8UC3);
+
+	// color lookup table, built on the first call and reused afterwards
+	const int tableSize = 256;
+	const float scaleZ = 1.f * (tableSize - 1) / maxz;
+	static std::vector<cv::Vec3b> colorTable;
+	if (colorTable.empty())
+	{
+		colorTable.resize(tableSize);
+		for (int i = 0; i < tableSize; i++)
+			colorTable[i] = computeColor(1.f * i / tableSize);
+	}
+
+	for (const cv::Point3f& pt : points)
+	{
+		const float X = pt.x;
+		const float Z = pt.z;
+
+		const int u = cvRound(pixelsPerMeter * X) + SIZE_X / 2;
+		const int v = SIZE_Z - cvRound(pixelsPerMeter * Z);
+
+		// Z is only capped from above by std::min; a negative Z would yield a negative
+		// table index, so the index is clamped to the valid range on both sides.
+		const int rawIndex = cvRound(scaleZ * std::min(Z, 1.f * maxz));
+		const int colorIndex = std::max(0, std::min(rawIndex, tableSize - 1));
+
+		const auto& color = colorTable[colorIndex];
+		cv::circle(draw, cv::Point(u, v), 1, color);
+	}
+}
+
+// Plays an image sequence in an endless loop: every frame pair is matched on the GPU
+// with sgm::StereoSGM, the disparity is reprojected to 3D points using the camera
+// parameters from the given XML file, and the left image, the colored disparity and a
+// top view of the points are shown. When a frame cannot be read the sequence restarts
+// at start_number; ESC quits.
+int main(int argc, char* argv[])
+{
+	if (argc < 4) {
+		std::cout << "usage: " << argv[0] << " left-image-format right-image-format camera.xml [disp_size] [subpixel_enable(0: false, 1:true)]" << std::endl;
+		std::exit(EXIT_FAILURE);
+	}
+
+	const int start_number = 1;
+
+	// the first frame pair fixes the image geometry for the whole run
+	cv::Mat I1 = cv::imread(cv::format(argv[1], start_number), cv::IMREAD_UNCHANGED);
+	cv::Mat I2 = cv::imread(cv::format(argv[2], start_number), cv::IMREAD_UNCHANGED);
+
+	const cv::FileStorage fs(argv[3], cv::FileStorage::READ);
+	const int disp_size = argc >= 5 ? std::stoi(argv[4]) : 128;
+	const bool subpixel = argc >= 6 ? std::stoi(argv[5]) != 0 : true;
+
+	ASSERT_MSG(!I1.empty() && !I2.empty(), "imread failed.");
+	ASSERT_MSG(fs.isOpened(), "camera.xml read failed.");
+	ASSERT_MSG(I1.size() == I2.size() && I1.type() == I2.type(), "input images must be same size and type.");
+	ASSERT_MSG(I1.type() == CV_8U || I1.type() == CV_16U, "input image format must be CV_8U or CV_16U.");
+	ASSERT_MSG(disp_size == 64 || disp_size == 128 || disp_size == 256, "disparity size must be 64, 128 or 256.");
+
+	// read camera parameters
+	CameraParameters camera;
+	camera.fu = fs["FocalLengthX"];
+	camera.fv = fs["FocalLengthY"];
+	camera.u0 = fs["CenterX"];
+	camera.v0 = fs["CenterY"];
+	camera.baseline = fs["BaseLine"];
+	camera.tilt = fs["Tilt"];
+	// height is not read from the file; give it a defined value because the whole
+	// struct is copied by CoordinateTransform
+	camera.height = 0.f;
+	// the reprojection divides by both focal lengths
+	ASSERT_MSG(camera.fu > 0.f && camera.fv > 0.f, "camera.xml must provide positive FocalLengthX and FocalLengthY.");
+
+	const int width = I1.cols;
+	const int height = I1.rows;
+	const int src_type = I1.type();
+
+	const int src_depth = I1.type() == CV_8U ? 8 : 16;
+	const int dst_depth = 16;
+	const int src_bytes = src_depth * width * height / 8;
+	const int dst_bytes = dst_depth * width * height / 8;
+
+	const sgm::StereoSGM::Parameters param(10, 120, 0.95f, subpixel);
+	sgm::StereoSGM sgm(width, height, disp_size, src_depth, dst_depth, sgm::EXECUTE_INOUT_CUDA2CUDA, param);
+
+	device_buffer d_I1(src_bytes), d_I2(src_bytes), d_disparity(dst_bytes);
+	cv::Mat disparity(height, width, dst_depth == 8 ? CV_8S : CV_16S), disparity_color, disparity_32f, draw;
+	std::vector<cv::Point3f> points;
+
+	const int invalid_disp = sgm.get_invalid_disparity();
+	const int disp_scale = subpixel ? sgm::StereoSGM::SUBPIXEL_SCALE : 1;
+
+	for (int frame_no = start_number;; frame_no++) {
+
+		I1 = cv::imread(cv::format(argv[1], frame_no), cv::IMREAD_UNCHANGED);
+		I2 = cv::imread(cv::format(argv[2], frame_no), cv::IMREAD_UNCHANGED);
+		if (I1.empty() || I2.empty()) {
+			// end of sequence (or unreadable frame): start over from the first index
+			frame_no = start_number - 1;
+			continue;
+		}
+
+		// The device buffers and the matcher were sized from the first frame, and upload()
+		// only receives a raw pointer, so a frame of another size or type must be rejected
+		// here before it is read past its end.
+		ASSERT_MSG(I1.cols == width && I1.rows == height && I2.size() == I1.size(), "all frames must have the size of the first frame.");
+		ASSERT_MSG(I1.type() == src_type && I2.type() == src_type, "all frames must have the type of the first frame.");
+
+		d_I1.upload(I1.data);
+		d_I2.upload(I2.data);
+
+		// steady_clock is monotonic, so the measurement is not disturbed by wall-clock adjustments
+		const auto t1 = std::chrono::steady_clock::now();
+
+		sgm.execute(d_I1.data, d_I2.data, d_disparity.data);
+		cudaDeviceSynchronize();
+
+		const auto t2 = std::chrono::steady_clock::now();
+		const auto duration = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
+		const double fps = 1e6 / duration;
+
+		d_disparity.download(disparity.data);
+
+		// reproject points; dividing by disp_scale undoes the subpixel scaling of the raw values
+		disparity.convertTo(disparity_32f, CV_32F, 1. / disp_scale);
+		reprojectPointsTo3D(disparity_32f, camera, points, subpixel);
+
+		// draw results
+		if (I1.type() != CV_8U)
+			cv::normalize(I1, I1, 0, 255, cv::NORM_MINMAX, CV_8U);
+
+		colorize_disparity(disparity, disparity_color, disp_scale * disp_size, disparity == invalid_disp);
+		cv::putText(disparity_color, cv::format("sgm execution time: %4.1f[msec] %4.1f[FPS]",
+			1e-3 * duration, fps), cv::Point(50, 50), 2, 0.75, cv::Scalar(255, 255, 255));
+
+		drawPoints3D(points, draw);
+
+		cv::imshow("left image", I1);
+		cv::imshow("disparity", disparity_color);
+		cv::imshow("points", draw);
+
+		const char c = cv::waitKey(1);
+		if (c == 27) // ESC
+			break;
+	}
+
+	return 0;
+}
@@ -0,0 +1,114 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include <iostream>
+#include <chrono>
+
+#include <opencv2/core.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/highgui.hpp>
+
+#include <sl/Camera.hpp>
+
+#include <libsgm.h>
+
+#include "sample_common.h"
+
+// Option table handed to cv::CommandLineParser in main(); each entry is written as
+// "{ name | default value | help text }".
+static const std::string keys =
+"{ disp_size           |    128 | maximum possible disparity value                  }"
+"{ camera_resolution   |      3 | camera resolution (0:HD2K 1:HD1080 2:HD720 3:VGA) }"
+"{ help h              |        | display this help and exit                        }";
+
+// Live disparity demo for a ZED camera.
+//
+// Opens the camera at the requested resolution, then for every successfully grabbed
+// frame retrieves the left/right gray views into GPU memory, runs sgm::StereoSGM on
+// them, and displays the colorized disparity together with the measured execution time.
+// The loop ends when ESC is pressed in the display window.
+//
+// Returns 0 on normal termination (or after printing the help text) and EXIT_FAILURE
+// when the camera cannot be opened.
+int main(int argc, char* argv[])
+{
+	cv::CommandLineParser parser(argc, argv, keys);
+	if (parser.has("help")) {
+		parser.printMessage();
+		return 0;
+	}
+
+	const int disp_size = parser.get<int>("disp_size");
+	const sl::RESOLUTION camera_resolution = parser.get<sl::RESOLUTION>("camera_resolution");
+
+	sl::Camera zed;
+	sl::InitParameters initParameters;
+	initParameters.camera_resolution = camera_resolution;
+	const sl::ERROR_CODE err = zed.open(initParameters);
+	if (err != sl::ERROR_CODE::SUCCESS) {
+		std::cerr << sl::toString(err) << std::endl;
+		// Return rather than std::exit() so that the locals above (camera, parser)
+		// are destroyed through normal stack unwinding.
+		return EXIT_FAILURE;
+	}
+
+	// Both views are retrieved into GPU buffers that must share the same row step,
+	// because a single source pitch is passed to StereoSGM.
+	const auto& resolution = zed.getCameraInformation().camera_configuration.resolution;
+	sl::Mat d_zed_image_L(resolution, sl::MAT_TYPE::U8_C1, sl::MEM::GPU);
+	sl::Mat d_zed_image_R(resolution, sl::MAT_TYPE::U8_C1, sl::MEM::GPU);
+	CV_Assert(d_zed_image_L.getStep(sl::MEM::GPU) == d_zed_image_R.getStep(sl::MEM::GPU));
+
+	const int width = resolution.width;
+	const int height = resolution.height;
+	const int src_pitch = static_cast<int>(d_zed_image_L.getStep(sl::MEM::GPU));
+	const int dst_pitch = width;
+
+	// 8-bit input; the output needs 16 bits once disparities can reach 256 or more.
+	const int src_depth = 8;
+	const int dst_depth = disp_size < 256 ? 8 : 16;
+	const int dst_bytes = dst_depth * width * height / 8;
+
+	sgm::StereoSGM sgm(width, height, disp_size, src_depth, dst_depth, src_pitch, dst_pitch, sgm::EXECUTE_INOUT_CUDA2CUDA);
+
+	device_buffer d_disparity(dst_bytes);
+	cv::Mat disparity(height, width, dst_depth == 8 ? CV_8S : CV_16S), disparity_color;
+
+	const int invalid_disp = sgm.get_invalid_disparity();
+
+	std::cout << "max disparity    : " << disp_size << std::endl;
+	std::cout << "camera resolution: " << sl::toString(initParameters.camera_resolution) << " " << cv::Size(width, height) << std::endl;
+
+	while (1) {
+
+		// Skip this iteration entirely when no new frame could be grabbed.
+		if (zed.grab() == sl::ERROR_CODE::SUCCESS) {
+			zed.retrieveImage(d_zed_image_L, sl::VIEW::LEFT_GRAY, sl::MEM::GPU);
+			zed.retrieveImage(d_zed_image_R, sl::VIEW::RIGHT_GRAY, sl::MEM::GPU);
+		}
+		else {
+			continue;
+		}
+
+		const auto t1 = std::chrono::system_clock::now();
+
+		sgm.execute(d_zed_image_L.getPtr<uchar>(sl::MEM::GPU), d_zed_image_R.getPtr<uchar>(sl::MEM::GPU), d_disparity.data);
+		// Wait for the GPU so that the measured interval covers the whole execution.
+		cudaDeviceSynchronize();
+
+		const auto t2 = std::chrono::system_clock::now();
+		const auto duration = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
+		const double fps = 1e6 / duration;
+
+		d_disparity.download(disparity.data);
+
+		// draw results
+		colorize_disparity(disparity, disparity_color, disp_size, disparity == invalid_disp);
+		cv::putText(disparity_color, cv::format("sgm execution time: %4.1f[msec] %4.1f[FPS]",
+			1e-3 * duration, fps), cv::Point(50, 50), 2, 0.75, cv::Scalar(255, 255, 255));
+
+		cv::imshow("disparity", disparity_color);
+
+		const char c = cv::waitKey(1);
+		if (c == 27) // ESC
+			break;
+	}
+
+	return 0;
+}
@@ -0,0 +1,55 @@
+cmake_minimum_required(VERSION 3.18)
+
+# NOTE: PROJECT_SOURCE_DIR is read before this file's own project() call, so it
+# still names the source directory of the enclosing project (presumably the
+# libSGM root that adds this directory).
+set(LIBSGM_ROOT_DIR "${PROJECT_SOURCE_DIR}")
+set(LIBSGM_INCLUDE_DIR "${LIBSGM_ROOT_DIR}/include")
+
+# create project
+set(PROJECT_NAME sgm)
+project(${PROJECT_NAME} LANGUAGES CXX CUDA)
+
+# dependent packages
+find_package(CUDAToolkit REQUIRED)
+
+if(BUILD_OPENCV_WRAPPER)
+	find_package(OpenCV REQUIRED core)
+endif()
+
+# library type
+set(SGM_LIB_TYPE STATIC)
+if(LIBSGM_SHARED)
+	set(SGM_LIB_TYPE SHARED)
+endif()
+
+# target configuration
+# CONFIGURE_DEPENDS makes the build re-check the glob, so that added or removed
+# source files are picked up without a manual re-configure.
+file(GLOB SRCS CONFIGURE_DEPENDS ./*.cpp ./*.cu ./*.h* "${LIBSGM_INCLUDE_DIR}/*.h*")
+add_library(${PROJECT_NAME} ${SGM_LIB_TYPE})
+target_sources(${PROJECT_NAME} PRIVATE ${SRCS})
+# The public headers are needed both to build the library and by its consumers.
+target_include_directories(${PROJECT_NAME} PUBLIC "${LIBSGM_INCLUDE_DIR}")
+target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17)
+target_link_libraries(${PROJECT_NAME} PUBLIC CUDA::cudart)
+
+# OpenCV is only used when the cv::Mat / cv::cuda::GpuMat wrapper is enabled.
+if(BUILD_OPENCV_WRAPPER)
+	target_include_directories(${PROJECT_NAME} PRIVATE ${OpenCV_INCLUDE_DIRS})
+	target_link_libraries(${PROJECT_NAME} PUBLIC ${OpenCV_LIBS})
+endif()
+
+# Generator expressions holding several options are quoted and use ';' as the
+# separator, so each expression reaches CMake as a single argument.
+target_compile_options(${PROJECT_NAME} PRIVATE
+	"$<$<COMPILE_LANG_AND_ID:CXX,GNU,Clang>:-Wall;-O3>"
+	"$<$<COMPILE_LANG_AND_ID:CXX,MSVC>:/wd4819>"
+	"$<$<COMPILE_LANGUAGE:CUDA>:-lineinfo>"
+)
+
+# Relative destinations are resolved against CMAKE_INSTALL_PREFIX at install
+# time, which keeps the install tree relocatable.
+install(
+	TARGETS ${PROJECT_NAME}
+	ARCHIVE DESTINATION lib
+	LIBRARY DESTINATION lib
+	RUNTIME DESTINATION bin
+)
+
+# Installs the include directory itself (no trailing slash) under the prefix.
+install(
+	DIRECTORY "${LIBSGM_INCLUDE_DIR}"
+	DESTINATION .
+	FILES_MATCHING PATTERN "*.h" PATTERN "*.hpp"
+)
+
+install(
+	FILES "${LIBSGM_ROOT_DIR}/FindLibSGM.cmake"
+	DESTINATION .
+)
@@ -0,0 +1,212 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include "internal.h"
+
+#include <cuda_runtime.h>
+
+#include "types.h"
+#include "host_utility.h"
+
+namespace sgm
+{
+namespace
+{
+
+// Size, in pixels, of the neighbourhood examined around each pixel by the census kernels.
+static constexpr int WINDOW_WIDTH  = 9;
+static constexpr int WINDOW_HEIGHT = 7;
+
+// Launch geometry of the census kernels: threads per block, and the number of
+// image rows each block processes.
+static constexpr int BLOCK_SIZE = 128;
+static constexpr int LINES_PER_BLOCK = 16;
+
+// Computes a 64-bit census descriptor for every pixel whose complete
+// WINDOW_WIDTH x WINDOW_HEIGHT neighbourhood lies inside the image and stores it
+// at dest[x + y * width]. Pixels closer than half a window to the image border
+// are not written.
+//
+//   dest          : output descriptors, addressed with `width` as row stride
+//   src           : input image, addressed with `pitch` as row stride (in elements)
+//   width, height : image size in pixels
+//
+// Each block covers LINES_PER_BLOCK rows and BLOCK_SIZE - WINDOW_WIDTH + 1 output
+// columns; the first and last half_kw threads of a block only load halo pixels.
+template <typename T>
+__global__ void census_transform_kernel(uint64_t* dest, const T* src, int width, int height, int pitch)
+{
+	using pixel_type = T;
+	using feature_type = uint64_t;
+
+	// One row more than the window height, so the next row can be loaded while the
+	// rows of the current window are still being read.
+	static const int SMEM_BUFFER_SIZE = WINDOW_HEIGHT + 1;
+
+	const int half_kw = WINDOW_WIDTH / 2;
+	const int half_kh = WINDOW_HEIGHT / 2;
+
+	// Ring buffer of image rows; row index is taken modulo SMEM_BUFFER_SIZE.
+	__shared__ pixel_type smem_lines[SMEM_BUFFER_SIZE][BLOCK_SIZE];
+
+	const int tid = threadIdx.x;
+	// Leftmost column loaded by this block (includes the left halo).
+	const int x0 = blockIdx.x * (BLOCK_SIZE - WINDOW_WIDTH + 1) - half_kw;
+	const int y0 = blockIdx.y * LINES_PER_BLOCK;
+
+	// Preload the WINDOW_HEIGHT rows centred on the block's first row;
+	// positions outside the image are read as 0.
+	for (int i = 0; i < WINDOW_HEIGHT; ++i) {
+		const int x = x0 + tid, y = y0 - half_kh + i;
+		pixel_type value = 0;
+		if (0 <= x && x < width && 0 <= y && y < height) {
+			value = src[x + y * pitch];
+		}
+		smem_lines[i][tid] = value;
+	}
+	__syncthreads();
+
+#pragma unroll
+	for (int i = 0; i < LINES_PER_BLOCK; ++i) {
+		if (i + 1 < LINES_PER_BLOCK) {
+			// Load to smem
+			// Fetches the bottom row of the *next* iteration's window into the one
+			// slot that the current window does not use.
+			const int x = x0 + tid, y = y0 + half_kh + i + 1;
+			pixel_type value = 0;
+			if (0 <= x && x < width && 0 <= y && y < height) {
+				value = src[x + y * pitch];
+			}
+			const int smem_x = tid;
+			const int smem_y = (WINDOW_HEIGHT + i) % SMEM_BUFFER_SIZE;
+			smem_lines[smem_y][smem_x] = value;
+		}
+
+		if (half_kw <= tid && tid < BLOCK_SIZE - half_kw) {
+			// Compute and store
+			const int x = x0 + tid, y = y0 + i;
+			if (half_kw <= x && x < width - half_kw && half_kh <= y && y < height - half_kh) {
+				const int smem_x = tid;
+				const int smem_y = (half_kh + i) % SMEM_BUFFER_SIZE;
+				// Centre pixel that every neighbour is compared against.
+				const auto a = smem_lines[smem_y][smem_x];
+				feature_type f = 0;
+				for (int dy = -half_kh; dy <= half_kh; ++dy) {
+					for (int dx = -half_kw; dx <= half_kw; ++dx) {
+						// NOTE(review): this condition skips the entire centre row and
+						// centre column, not just the centre pixel, so 8 * 6 = 48 bits
+						// are produced. Confirm whether `dx != 0 || dy != 0` was intended.
+						if (dx != 0 && dy != 0) {
+							const int smem_y1 = (smem_y + dy + SMEM_BUFFER_SIZE) % SMEM_BUFFER_SIZE;
+							const int smem_x1 = smem_x + dx;
+							const auto b = smem_lines[smem_y1][smem_x1];
+							// Append one bit: 1 when the centre is brighter than the neighbour.
+							f = (f << 1) | (a > b);
+						}
+					}
+				}
+				dest[x + y * width] = f;
+			}
+		}
+		// Make the freshly loaded row visible, and finish all reads of the current
+		// window, before the next iteration overwrites another slot.
+		__syncthreads();
+	}
+}
+
+// Computes a 32-bit "symmetric" census descriptor: instead of comparing each
+// neighbour with the centre pixel, it compares pairs of pixels that are mirrored
+// through the centre of the WINDOW_WIDTH x WINDOW_HEIGHT window.
+// The result is stored at dest[x + y * width]; pixels closer than half a window
+// to the image border are not written.
+//
+//   dest          : output descriptors, addressed with `width` as row stride
+//   src           : input image, addressed with `pitch` as row stride (in elements)
+//   width, height : image size in pixels
+//
+// Work distribution and shared-memory row buffering follow the same scheme as
+// census_transform_kernel in this file.
+template <typename T>
+__global__ void symmetric_census_kernel(uint32_t* dest, const T* src, int width, int height, int pitch)
+{
+	using pixel_type = T;
+	using feature_type = uint32_t;
+
+	// One spare row so the next image row can be loaded while the current window is read.
+	static const int SMEM_BUFFER_SIZE = WINDOW_HEIGHT + 1;
+
+	const int half_kw = WINDOW_WIDTH  / 2;
+	const int half_kh = WINDOW_HEIGHT / 2;
+
+	// Ring buffer of image rows; row index is taken modulo SMEM_BUFFER_SIZE.
+	__shared__ pixel_type smem_lines[SMEM_BUFFER_SIZE][BLOCK_SIZE];
+
+	const int tid = threadIdx.x;
+	// Leftmost column loaded by this block (includes the left halo).
+	const int x0 = blockIdx.x * (BLOCK_SIZE - WINDOW_WIDTH + 1) - half_kw;
+	const int y0 = blockIdx.y * LINES_PER_BLOCK;
+
+	// Preload the rows of the first window; positions outside the image read as 0.
+	for(int i = 0; i < WINDOW_HEIGHT; ++i){
+		const int x = x0 + tid, y = y0 - half_kh + i;
+		pixel_type value = 0;
+		if(0 <= x && x < width && 0 <= y && y < height){
+			value = src[x + y * pitch];
+		}
+		smem_lines[i][tid] = value;
+	}
+	__syncthreads();
+
+#pragma unroll
+	for(int i = 0; i < LINES_PER_BLOCK; ++i){
+		if(i + 1 < LINES_PER_BLOCK){
+			// Load to smem
+			// Fetches the bottom row needed by the next iteration into the unused slot.
+			const int x = x0 + tid, y = y0 + half_kh + i + 1;
+			pixel_type value = 0;
+			if(0 <= x && x < width && 0 <= y && y < height){
+				value = src[x + y * pitch];
+			}
+			const int smem_x = tid;
+			const int smem_y = (WINDOW_HEIGHT + i) % SMEM_BUFFER_SIZE;
+			smem_lines[smem_y][smem_x] = value;
+		}
+
+		if(half_kw <= tid && tid < BLOCK_SIZE - half_kw){
+			// Compute and store
+			const int x = x0 + tid, y = y0 + i;
+			if(half_kw <= x && x < width - half_kw && half_kh <= y && y < height - half_kh){
+				const int smem_x = tid;
+				const int smem_y = (half_kh + i) % SMEM_BUFFER_SIZE;
+				feature_type f = 0;
+				// Rows above the centre, each paired with the mirrored row below:
+				// half_kh * WINDOW_WIDTH = 27 comparisons.
+				for(int dy = -half_kh; dy < 0; ++dy){
+					const int smem_y1 = (smem_y + dy + SMEM_BUFFER_SIZE) % SMEM_BUFFER_SIZE;
+					const int smem_y2 = (smem_y - dy + SMEM_BUFFER_SIZE) % SMEM_BUFFER_SIZE;
+					for(int dx = -half_kw; dx <= half_kw; ++dx){
+						const int smem_x1 = smem_x + dx;
+						const int smem_x2 = smem_x - dx;
+						const auto a = smem_lines[smem_y1][smem_x1];
+						const auto b = smem_lines[smem_y2][smem_x2];
+						f = (f << 1) | (a > b);
+					}
+				}
+				// Centre row: left half paired with the mirrored right half,
+				// half_kw = 4 more comparisons (31 bits in total).
+				for(int dx = -half_kw; dx < 0; ++dx){
+					const int smem_x1 = smem_x + dx;
+					const int smem_x2 = smem_x - dx;
+					const auto a = smem_lines[smem_y][smem_x1];
+					const auto b = smem_lines[smem_y][smem_x2];
+					f = (f << 1) | (a > b);
+				}
+				dest[x + y * width] = f;
+			}
+		}
+		// Finish all reads of the current window and publish the new row before
+		// the next iteration touches the ring buffer again.
+		__syncthreads();
+	}
+}
+
+} // namespace
+
+namespace details
+{
+
+// Allocates `dst` with the same size as `src` and fills it with census descriptors.
+// CENSUS_9x7 produces 64-bit descriptors, every other type a 32-bit image; the
+// kernel instantiation is chosen from the pixel type of `src` (8-bit, 16-bit,
+// otherwise treated as 32-bit). Launch errors are reported through CUDA_CHECK.
+void census_transform(const DeviceImage& src, DeviceImage& dst, CensusType type)
+{
+	const int cols = src.cols;
+	const int rows = src.rows;
+	const bool full_census = (type == CensusType::CENSUS_9x7);
+
+	// Every block yields BLOCK_SIZE - WINDOW_WIDTH + 1 columns and LINES_PER_BLOCK rows.
+	const dim3 blocks(divUp(cols, BLOCK_SIZE - WINDOW_WIDTH + 1), divUp(rows, LINES_PER_BLOCK));
+	const dim3 threads(BLOCK_SIZE);
+
+	dst.create(rows, cols, full_census ? SGM_64U : SGM_32U);
+
+	if (full_census) {
+		uint64_t* const features = dst.ptr<uint64_t>();
+		if (src.type == SGM_8U) {
+			census_transform_kernel<<<blocks, threads>>>(features, src.ptr<uint8_t>(), cols, rows, src.step);
+		}
+		else if (src.type == SGM_16U) {
+			census_transform_kernel<<<blocks, threads>>>(features, src.ptr<uint16_t>(), cols, rows, src.step);
+		}
+		else {
+			census_transform_kernel<<<blocks, threads>>>(features, src.ptr<uint32_t>(), cols, rows, src.step);
+		}
+	}
+	else if (type == CensusType::SYMMETRIC_CENSUS_9x7) {
+		uint32_t* const features = dst.ptr<uint32_t>();
+		if (src.type == SGM_8U) {
+			symmetric_census_kernel<<<blocks, threads>>>(features, src.ptr<uint8_t>(), cols, rows, src.step);
+		}
+		else if (src.type == SGM_16U) {
+			symmetric_census_kernel<<<blocks, threads>>>(features, src.ptr<uint16_t>(), cols, rows, src.step);
+		}
+		else {
+			symmetric_census_kernel<<<blocks, threads>>>(features, src.ptr<uint32_t>(), cols, rows, src.step);
+		}
+	}
+
+	CUDA_CHECK(cudaGetLastError());
+}
+
+} // namespace details
+} // namespace sgm
@@ -0,0 +1,87 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include "internal.h"
+
+#include <cuda_runtime.h>
+
+#include "constants.h"
+#include "host_utility.h"
+
+namespace
+{
+
+// Invalidates entries of the left disparity map, one thread per pixel.
+// A pixel of dispL is overwritten with sgm::INVALID_DISP when
+//   - the corresponding pixel of srcL is 0, or
+//   - it already holds sgm::INVALID_DISP, or
+//   - LR_max_diff is non-negative, the matching column k = x - d lies inside the
+//     image, and dispR at that column differs from d by more than LR_max_diff.
+// When `subpixel` is set, the left value is shifted right by SUBPIXEL_SHIFT
+// before it is used as d.
+template<typename SRC_T, typename DST_T>
+__global__ void check_consistency_kernel(DST_T* dispL, const DST_T* dispR, const SRC_T* srcL, int width, int height, int src_pitch, int dst_pitch, bool subpixel, int LR_max_diff)
+{
+	const int col = blockIdx.x * blockDim.x + threadIdx.x;
+	const int row = blockIdx.y * blockDim.y + threadIdx.y;
+	if (col >= width || row >= height)
+		return;
+
+	// left-right consistency check, only on leftDisp, but could be done for rightDisp too
+
+	const SRC_T src_pixel = srcL[row * src_pitch + col];
+	const DST_T stored = dispL[row * dst_pitch + col];
+
+	// Integer disparity used for the lookup in the right map.
+	int d = stored;
+	if (subpixel) {
+		d >>= sgm::StereoSGM::SUBPIXEL_SHIFT;
+	}
+	const int k = col - d;
+
+	bool invalidate = (src_pixel == 0) || (stored == sgm::INVALID_DISP);
+	if (!invalidate && k >= 0 && k < width && LR_max_diff >= 0) {
+		// dispR is only read once the column is known to be inside the image.
+		invalidate = abs(dispR[row * dst_pitch + k] - d) > LR_max_diff;
+	}
+
+	if (invalidate) {
+		// masked or left-right inconsistent pixel -> invalid
+		dispL[row * dst_pitch + col] = static_cast<DST_T>(sgm::INVALID_DISP);
+	}
+}
+
+} // namespace
+
+namespace sgm
+{
+namespace details
+{
+
+// Runs the left-right consistency check on `dispL` in place, using `dispR` as the
+// reference and `srcL` as a mask (see check_consistency_kernel). Both disparity
+// images must be SGM_16U; the kernel is instantiated for the pixel type of
+// `srcL` (8-bit, 16-bit, otherwise treated as 32-bit).
+void check_consistency(DeviceImage& dispL, const DeviceImage& dispR, const DeviceImage& srcL, bool subpixel, int LR_max_diff)
+{
+	SGM_ASSERT(dispL.type == SGM_16U && dispR.type == SGM_16U, "");
+
+	const int cols = srcL.cols;
+	const int rows = srcL.rows;
+
+	// One thread per pixel, in 16x16 tiles.
+	const dim3 threads(16, 16);
+	const dim3 blocks(divUp(cols, threads.x), divUp(rows, threads.y));
+
+	auto left = dispL.ptr<uint16_t>();
+	auto right = dispR.ptr<uint16_t>();
+
+	if (srcL.type == SGM_8U) {
+		check_consistency_kernel<uint8_t><<<blocks, threads>>>(left, right,
+			srcL.ptr<uint8_t>(), cols, rows, srcL.step, dispL.step, subpixel, LR_max_diff);
+	}
+	else if (srcL.type == SGM_16U) {
+		check_consistency_kernel<uint16_t><<<blocks, threads>>>(left, right,
+			srcL.ptr<uint16_t>(), cols, rows, srcL.step, dispL.step, subpixel, LR_max_diff);
+	}
+	else {
+		check_consistency_kernel<uint32_t><<<blocks, threads>>>(left, right,
+			srcL.ptr<uint32_t>(), cols, rows, srcL.step, dispL.step, subpixel, LR_max_diff);
+	}
+
+	CUDA_CHECK(cudaGetLastError());
+}
+
+} // namespace details
+} // namespace sgm
@@ -0,0 +1,29 @@
+/*Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#ifndef __CONSTANTS_H__
+#define __CONSTANTS_H__
+
+#include "types.h"
+
+namespace sgm
+{
+
+// Number of threads in a warp, as assumed by the kernels of this library.
+static constexpr unsigned int WARP_SIZE = 32u;
+// Marker stored in disparity images for pixels without a valid disparity:
+// -1 converted to output_type (declared in types.h).
+static constexpr output_type INVALID_DISP = static_cast<output_type>(-1);
+
+} // namespace sgm
+
+#endif // !__CONSTANTS_H__
@@ -0,0 +1,73 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include "internal.h"
+
+#include <cuda_runtime.h>
+
+#include "constants.h"
+#include "host_utility.h"
+
+namespace
+{
+
+// Rewrites a 16-bit disparity image in place, one thread per pixel:
+// pixels equal to sgm::INVALID_DISP become `invalid_disp_scaled`, all other
+// pixels are increased by `min_disp_scaled`. `pitch` is the row stride in elements.
+__global__ void correct_disparity_range_kernel(uint16_t* d_disp, int width, int height, int pitch, int min_disp_scaled, int invalid_disp_scaled)
+{
+	const int col = blockIdx.x * blockDim.x + threadIdx.x;
+	const int row = blockIdx.y * blockDim.y + threadIdx.y;
+
+	if (col < width && row < height) {
+		uint16_t& pixel = d_disp[row * pitch + col];
+		const uint16_t current = pixel;
+		// Both results are narrowed to 16 bits, exactly like the stored pixel type.
+		pixel = (current == sgm::INVALID_DISP)
+			? static_cast<uint16_t>(invalid_disp_scaled)
+			: static_cast<uint16_t>(current + min_disp_scaled);
+	}
+}
+
+} // namespace
+
+namespace sgm
+{
+namespace details
+{
+
+// Shifts the disparities of `disp` by `min_disp` and remaps invalid pixels to
+// `min_disp - 1`; both values are multiplied by StereoSGM::SUBPIXEL_SCALE when
+// `subpixel` is set. Nothing is done when `subpixel` is false and `min_disp` is 0.
+void correct_disparity_range(DeviceImage& disp, bool subpixel, int min_disp)
+{
+	const bool nothing_to_do = !subpixel && min_disp == 0;
+	if (nothing_to_do) {
+		return;
+	}
+
+	const int cols = disp.cols;
+	const int rows = disp.rows;
+
+	// One thread per pixel, in TILE x TILE blocks.
+	constexpr int TILE = 16;
+	const dim3 threads(TILE, TILE);
+	const dim3 blocks(divUp(cols, TILE), divUp(rows, TILE));
+
+	const int factor = subpixel ? StereoSGM::SUBPIXEL_SCALE : 1;
+	const int shift_value = min_disp * factor;
+	const int invalid_value = (min_disp - 1) * factor;
+
+	correct_disparity_range_kernel<<<blocks, threads>>>(disp.ptr<uint16_t>(), cols, rows, disp.step, shift_value, invalid_value);
+	CUDA_CHECK(cudaGetLastError());
+}
+
+} // namespace details
+} // namespace sgm
@@ -0,0 +1,668 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include "internal.h"
+
+#include <cuda_runtime.h>
+
+#include "device_utility.h"
+#include "host_utility.h"
+
+// Warp-shuffle wrappers with a common signature. When CUDA_VERSION is 9000 or
+// newer they expand to the *_sync intrinsics and forward the participation mask;
+// otherwise they expand to the legacy intrinsics and the mask argument is dropped.
+#if CUDA_VERSION >= 9000
+#define SHFL_UP(mask, var, delta, w) __shfl_up_sync((mask), (var), (delta), (w))
+#define SHFL_DOWN(mask, var, delta, w) __shfl_down_sync((mask), (var), (delta), (w))
+#else
+#define SHFL_UP(mask, var, delta, width) __shfl_up((var), (delta), (width))
+#define SHFL_DOWN(mask, var, delta, width) __shfl_down((var), (delta), (width))
+#endif
+
+namespace sgm
+{
+
+// Element type of the aggregated cost buffers written by the functions below.
+using COST_TYPE = cost_type;
+
+namespace cost_aggregation
+{
+
+// Number of set bits in a census descriptor. Only the 32-bit and 64-bit
+// specializations count bits; the generic fallback always returns 0.
+template <typename T>
+__device__ inline int popcnt(T bits)
+{
+	return 0;
+}
+
+template <>
+__device__ inline int popcnt<uint32_t>(uint32_t bits)
+{
+	return __popc(bits);
+}
+
+template <>
+__device__ inline int popcnt<uint64_t>(uint64_t bits)
+{
+	return __popcll(bits);
+}
+
+// Per-thread state of the SGM dynamic-programming recurrence along one path.
+// The disparity range of a path is split across SUBGROUP_SIZE cooperating
+// threads; each thread keeps the aggregated costs of DP_BLOCK_SIZE neighbouring
+// disparities in dp[] and exchanges the boundary values with the adjacent lanes
+// through warp shuffles.
+template <unsigned int DP_BLOCK_SIZE, unsigned int SUBGROUP_SIZE>
+struct DynamicProgramming
+{
+	static_assert(DP_BLOCK_SIZE >= 2, "DP_BLOCK_SIZE must be greater than or equal to 2");
+	static_assert((SUBGROUP_SIZE & (SUBGROUP_SIZE - 1)) == 0, "SUBGROUP_SIZE must be a power of 2");
+
+	// Minimum aggregated cost over the whole subgroup after the previous update().
+	uint32_t last_min;
+	// Aggregated costs of the disparities handled by this thread.
+	uint32_t dp[DP_BLOCK_SIZE];
+
+	// Starts a path with all costs (and the running minimum) at zero.
+	__device__ DynamicProgramming() : last_min(0)
+	{
+		for (unsigned int i = 0; i < DP_BLOCK_SIZE; ++i) { dp[i] = 0; }
+	}
+
+	// Advances the path by one pixel. For every k the new value is
+	//   local_costs[k] + min(dp[k], dp[k-1] + p1, dp[k+1] + p1, last_min + p2) - last_min
+	// where dp[-1] / dp[DP_BLOCK_SIZE] come from the neighbouring lanes.
+	//
+	//   local_costs : matching costs of this pixel for the thread's disparities
+	//   p1, p2      : penalties for a disparity change of one / of more than one
+	//   mask        : lanes taking part in the shuffles and in subgroup_min
+	__device__ void update(uint32_t *local_costs, uint32_t p1, uint32_t p2, uint32_t mask)
+	{
+		const unsigned int lane_id = threadIdx.x % SUBGROUP_SIZE;
+
+		// Keep the old dp[0]: it is overwritten below but is still needed as the
+		// value sent to the previous lane at the end.
+		const auto dp0 = dp[0];
+		// lazy_out holds the new value of dp[k] until iteration k+1 has read the
+		// old dp[k] as its lower neighbour.
+		uint32_t lazy_out = 0, local_min = 0;
+		{
+			const unsigned int k = 0;
+			// Last element of the previous lane = lower neighbour of dp[0].
+			const uint32_t prev = SHFL_UP(mask, dp[DP_BLOCK_SIZE - 1], 1, WARP_SIZE);
+			uint32_t out = min(dp[k] - last_min, p2);
+			// The first lane of a subgroup has no lower neighbour.
+			if (lane_id != 0) { out = min(out, prev - last_min + p1); }
+			out = min(out, dp[k + 1] - last_min + p1);
+			lazy_out = local_min = out + local_costs[k];
+		}
+		for (unsigned int k = 1; k + 1 < DP_BLOCK_SIZE; ++k) {
+			uint32_t out = min(dp[k] - last_min, p2);
+			out = min(out, dp[k - 1] - last_min + p1);
+			out = min(out, dp[k + 1] - last_min + p1);
+			dp[k - 1] = lazy_out;
+			lazy_out = out + local_costs[k];
+			local_min = min(local_min, lazy_out);
+		}
+		{
+			const unsigned int k = DP_BLOCK_SIZE - 1;
+			// First element (old value) of the next lane = upper neighbour of dp[k].
+			const uint32_t next = SHFL_DOWN(mask, dp0, 1, WARP_SIZE);
+			uint32_t out = min(dp[k] - last_min, p2);
+			out = min(out, dp[k - 1] - last_min + p1);
+			// The last lane of a subgroup has no upper neighbour.
+			if (lane_id + 1 != SUBGROUP_SIZE) {
+				out = min(out, next - last_min + p1);
+			}
+			dp[k - 1] = lazy_out;
+			dp[k] = out + local_costs[k];
+			local_min = min(local_min, dp[k]);
+		}
+		// Reduce the per-thread minima to the minimum over the whole subgroup.
+		last_min = subgroup_min<SUBGROUP_SIZE>(local_min, mask);
+	}
+};
+
+// Returns a value whose lowest SIZE bits are set (SIZE may be at most 32).
+template <unsigned int SIZE>
+__device__ unsigned int generate_mask()
+{
+	static_assert(SIZE <= 32, "SIZE must be less than or equal to 32");
+	// Built in 64 bits so that SIZE == 32 stays a valid shift.
+	constexpr unsigned long long low_bits = (1ull << SIZE) - 1u;
+	return static_cast<unsigned int>(low_bits);
+}
+
+// Reads ptr[x] through __ldg when 0 <= x < w; returns 0 for any other x.
+template <typename CENSUS_T>
+__device__ inline CENSUS_T load_census_with_check(const CENSUS_T* ptr, int x, int w)
+{
+	const bool inside = (0 <= x) && (x < w);
+	if (!inside) {
+		return 0;
+	}
+	return __ldg(ptr + x);
+}
+
+namespace vertical
+{
+
+// Disparities handled per thread, and threads per block, for the vertical paths.
+static constexpr unsigned int DP_BLOCK_SIZE = 16u;
+static constexpr unsigned int BLOCK_SIZE = WARP_SIZE * 8u;
+
+// Aggregates matching costs along vertical paths (one path per image column).
+// DIRECTION == 1 walks the rows from top to bottom, DIRECTION == -1 from bottom
+// to top. Every path is processed by SUBGROUP_SIZE threads, each responsible for
+// DP_BLOCK_SIZE disparities starting at dp_offset.
+//
+//   dest        : aggregated costs, laid out as [y][x][disparity]
+//   left, right : census images, `width` elements per row
+//   p1, p2      : smoothness penalties forwarded to DynamicProgramming::update
+//   min_disp    : disparity that corresponds to index 0 of the disparity range
+template <typename CENSUS_TYPE, int DIRECTION, unsigned int MAX_DISPARITY>
+__global__ void aggregate_vertical_path_kernel(
+	uint8_t *dest,
+	const CENSUS_TYPE *left,
+	const CENSUS_TYPE *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	int min_disp)
+{
+	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
+	static const unsigned int PATHS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE;
+	static const unsigned int PATHS_PER_BLOCK = BLOCK_SIZE / SUBGROUP_SIZE;
+
+	// Right-image pixels needed by all paths of a block for one row.
+	static const unsigned int RIGHT_BUFFER_SIZE = MAX_DISPARITY + PATHS_PER_BLOCK;
+	static const unsigned int RIGHT_BUFFER_ROWS = RIGHT_BUFFER_SIZE / DP_BLOCK_SIZE;
+
+	static_assert(DIRECTION == 1 || DIRECTION == -1, "");
+	if (width == 0 || height == 0) {
+		return;
+	}
+
+	// Entry i is stored at [i % DP_BLOCK_SIZE][i / DP_BLOCK_SIZE] and mirrored at
+	// [i % DP_BLOCK_SIZE + DP_BLOCK_SIZE][i / DP_BLOCK_SIZE - 1], so DP_BLOCK_SIZE
+	// consecutive entries can be read as [lo + j][hi] without wrapping.
+	// NOTE(review): the extra column (+ 1) looks like padding - confirm.
+	__shared__ CENSUS_TYPE right_buffer[2 * DP_BLOCK_SIZE][RIGHT_BUFFER_ROWS + 1];
+	DynamicProgramming<DP_BLOCK_SIZE, SUBGROUP_SIZE> dp;
+
+	const unsigned int warp_id = threadIdx.x / WARP_SIZE;
+	const unsigned int group_id = threadIdx.x % WARP_SIZE / SUBGROUP_SIZE;
+	const unsigned int lane_id = threadIdx.x % SUBGROUP_SIZE;
+	// Shuffle mask covering only the lanes of this thread's subgroup.
+	const unsigned int shfl_mask =
+		generate_mask<SUBGROUP_SIZE>() << (group_id * SUBGROUP_SIZE);
+
+	// Column (path) processed by this thread's subgroup.
+	const unsigned int x =
+		blockIdx.x * PATHS_PER_BLOCK +
+		warp_id * PATHS_PER_WARP +
+		group_id;
+	const unsigned int right_x0 = blockIdx.x * PATHS_PER_BLOCK;
+	// First disparity index handled by this thread.
+	const unsigned int dp_offset = lane_id * DP_BLOCK_SIZE;
+
+	// Index of this thread's first entry in right_buffer (split into row/column).
+	const unsigned int right0_addr =
+		(right_x0 + PATHS_PER_BLOCK - 1) - x + dp_offset;
+	const unsigned int right0_addr_lo = right0_addr % DP_BLOCK_SIZE;
+	const unsigned int right0_addr_hi = right0_addr / DP_BLOCK_SIZE;
+
+	for (unsigned int iter = 0; iter < height; ++iter) {
+		const unsigned int y = (DIRECTION > 0 ? iter : height - 1 - iter);
+		// Load left to register
+		// (left_value stays unset for x >= width; it is only used under the same guard)
+		CENSUS_TYPE left_value;
+		if (x < width) {
+			left_value = left[x + y * width];
+		}
+		// Load right to smem
+		// All threads of the block take part, including those with x >= width.
+		for (unsigned int i0 = 0; i0 < RIGHT_BUFFER_SIZE; i0 += BLOCK_SIZE) {
+			const unsigned int i = i0 + threadIdx.x;
+			if (i < RIGHT_BUFFER_SIZE) {
+				// Entry i holds the right pixel i columns left of the block's last path
+				// (shifted by min_disp); columns outside the image read as 0.
+				const int right_x = static_cast<int>(right_x0 + PATHS_PER_BLOCK - 1 - i - min_disp);
+				const CENSUS_TYPE right_value = load_census_with_check(&right[y * width], right_x, width);
+				const unsigned int lo = i % DP_BLOCK_SIZE;
+				const unsigned int hi = i / DP_BLOCK_SIZE;
+				right_buffer[lo][hi] = right_value;
+				if (hi > 0) {
+					right_buffer[lo + DP_BLOCK_SIZE][hi - 1] = right_value;
+				}
+			}
+		}
+		__syncthreads();
+		// Compute
+		if (x < width) {
+			CENSUS_TYPE right_values[DP_BLOCK_SIZE];
+			for (unsigned int j = 0; j < DP_BLOCK_SIZE; ++j) {
+				right_values[j] = right_buffer[right0_addr_lo + j][right0_addr_hi];
+			}
+			// Matching cost = Hamming distance between the census descriptors.
+			uint32_t local_costs[DP_BLOCK_SIZE];
+			for (unsigned int j = 0; j < DP_BLOCK_SIZE; ++j) {
+				local_costs[j] = popcnt(left_value ^ right_values[j]);
+			}
+			dp.update(local_costs, p1, p2, shfl_mask);
+			store_uint8_vector<DP_BLOCK_SIZE>(
+				&dest[dp_offset + x * MAX_DISPARITY + y * MAX_DISPARITY * width],
+				dp.dp);
+		}
+		// right_buffer is refilled in the next iteration; wait until everyone has read it.
+		__syncthreads();
+	}
+}
+
+// Launches the vertical aggregation kernel in top-to-bottom direction on `stream`,
+// using enough blocks to give every image column its own path.
+template <typename CENSUS_TYPE, unsigned int MAX_DISPARITY>
+void aggregate_up2down(
+	COST_TYPE *dest,
+	const CENSUS_TYPE *left,
+	const CENSUS_TYPE *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	int min_disp,
+	cudaStream_t stream)
+{
+	constexpr unsigned int threads_per_path = MAX_DISPARITY / DP_BLOCK_SIZE;
+	constexpr unsigned int paths_per_block = BLOCK_SIZE / threads_per_path;
+
+	// ceil(width / paths_per_block)
+	const int num_blocks = (width + paths_per_block - 1) / paths_per_block;
+	const int num_threads = BLOCK_SIZE;
+
+	aggregate_vertical_path_kernel<CENSUS_TYPE, 1, MAX_DISPARITY>
+		<<<num_blocks, num_threads, 0, stream>>>(
+			dest, left, right, width, height, p1, p2, min_disp);
+	CUDA_CHECK(cudaGetLastError());
+}
+
+// Launches the vertical aggregation kernel in bottom-to-top direction on `stream`,
+// using enough blocks to give every image column its own path.
+template <typename CENSUS_TYPE, unsigned int MAX_DISPARITY>
+void aggregate_down2up(
+	COST_TYPE *dest,
+	const CENSUS_TYPE *left,
+	const CENSUS_TYPE *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	int min_disp,
+	cudaStream_t stream)
+{
+	constexpr unsigned int threads_per_path = MAX_DISPARITY / DP_BLOCK_SIZE;
+	constexpr unsigned int paths_per_block = BLOCK_SIZE / threads_per_path;
+
+	// ceil(width / paths_per_block)
+	const int num_blocks = (width + paths_per_block - 1) / paths_per_block;
+	const int num_threads = BLOCK_SIZE;
+
+	aggregate_vertical_path_kernel<CENSUS_TYPE, -1, MAX_DISPARITY>
+		<<<num_blocks, num_threads, 0, stream>>>(
+			dest, left, right, width, height, p1, p2, min_disp);
+	CUDA_CHECK(cudaGetLastError());
+}
+
+} // namespace vertical
+
+namespace horizontal
+{
+
+// Disparities handled per DP block, and DP blocks (paths) handled per thread,
+// for the horizontal paths.
+static constexpr unsigned int DP_BLOCK_SIZE = 8u;
+static constexpr unsigned int DP_BLOCKS_PER_THREAD = 1u;
+
+// Threads per block, expressed as a whole number of warps.
+static constexpr unsigned int WARPS_PER_BLOCK = 4u;
+static constexpr unsigned int BLOCK_SIZE = WARP_SIZE * WARPS_PER_BLOCK;
+
+// Aggregates matching costs along horizontal paths (one path per image row).
+// DIRECTION == 1 walks the columns from left to right, DIRECTION == -1 from right
+// to left. Every path is processed by SUBGROUP_SIZE threads, each responsible for
+// DP_BLOCK_SIZE disparities starting at dp_offset; a thread handles
+// DP_BLOCKS_PER_THREAD rows that are SUBGROUPS_PER_WARP rows apart.
+//
+//   dest        : aggregated costs, laid out as [y][x][disparity]
+//   left, right : census images, `width` elements per row
+//   p1, p2      : smoothness penalties forwarded to DynamicProgramming::update
+//   min_disp    : disparity that corresponds to index 0 of the disparity range
+template <typename CENSUS_TYPE, int DIRECTION, unsigned int MAX_DISPARITY>
+__global__ void aggregate_horizontal_path_kernel(
+	uint8_t *dest,
+	const CENSUS_TYPE *left,
+	const CENSUS_TYPE *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	int min_disp)
+{
+	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
+	static const unsigned int SUBGROUPS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE;
+	static const unsigned int PATHS_PER_WARP =
+		WARP_SIZE * DP_BLOCKS_PER_THREAD / SUBGROUP_SIZE;
+	static const unsigned int PATHS_PER_BLOCK =
+		BLOCK_SIZE * DP_BLOCKS_PER_THREAD / SUBGROUP_SIZE;
+
+	static_assert(DIRECTION == 1 || DIRECTION == -1, "");
+	if (width == 0 || height == 0) {
+		return;
+	}
+
+	// Per-thread sliding window over the right census row: entry k is compared
+	// with the current left pixel for the thread's k-th disparity.
+	CENSUS_TYPE right_buffer[DP_BLOCKS_PER_THREAD][DP_BLOCK_SIZE];
+	DynamicProgramming<DP_BLOCK_SIZE, SUBGROUP_SIZE> dp[DP_BLOCKS_PER_THREAD];
+
+	const unsigned int warp_id = threadIdx.x / WARP_SIZE;
+	const unsigned int group_id = threadIdx.x % WARP_SIZE / SUBGROUP_SIZE;
+	const unsigned int lane_id = threadIdx.x % SUBGROUP_SIZE;
+	// Shuffle mask covering only the lanes of this thread's subgroup.
+	const unsigned int shfl_mask =
+		generate_mask<SUBGROUP_SIZE>() << (group_id * SUBGROUP_SIZE);
+
+	// First row (path) processed by this thread's subgroup.
+	const unsigned int y0 =
+		PATHS_PER_BLOCK * blockIdx.x +
+		PATHS_PER_WARP * warp_id +
+		group_id;
+	// Element distance between the rows a thread handles for successive j.
+	const unsigned int feature_step = SUBGROUPS_PER_WARP * width;
+	const unsigned int dest_step = SUBGROUPS_PER_WARP * MAX_DISPARITY * width;
+	// First disparity index handled by this thread.
+	const unsigned int dp_offset = lane_id * DP_BLOCK_SIZE;
+	// Rebase all pointers to row y0 (only dereferenced after the check below).
+	left += y0 * width;
+	right += y0 * width;
+	dest += y0 * MAX_DISPARITY * width;
+
+	if (y0 >= height) {
+		return;
+	}
+
+	// initialize census buffer
+	// Fills the window as it would look one step before the first processed
+	// column (x = -1 or x = width); columns outside the image read as 0.
+	{
+		const int x0 = (DIRECTION > 0 ? -1 : width) - (min_disp + static_cast<int>(dp_offset));
+		for (int dy = 0; dy < DP_BLOCKS_PER_THREAD; ++dy)
+			for (int dx = 0; dx < DP_BLOCK_SIZE; ++dx)
+				right_buffer[dy][dx] = load_census_with_check(&right[dy * feature_step], x0 - dx, width);
+	}
+
+	// Columns are visited in chunks of DP_BLOCK_SIZE; for the right-to-left
+	// direction the first chunk is the aligned chunk containing the last column.
+	int x0 = (DIRECTION > 0) ? 0 : static_cast<int>((width - 1) & ~(DP_BLOCK_SIZE - 1));
+	for (unsigned int iter = 0; iter < width; iter += DP_BLOCK_SIZE) {
+		for (unsigned int i = 0; i < DP_BLOCK_SIZE; ++i) {
+			const unsigned int x = x0 + (DIRECTION > 0 ? i : (DP_BLOCK_SIZE - 1 - i));
+			// Skip the columns of a partial chunk that lie outside the image.
+			if (x >= width) {
+				continue;
+			}
+			for (unsigned int j = 0; j < DP_BLOCKS_PER_THREAD; ++j) {
+				const unsigned int y = y0 + j * SUBGROUPS_PER_WARP;
+				if (y >= height) {
+					continue;
+				}
+				const CENSUS_TYPE left_value = __ldg(&left[j * feature_step + x]);
+				if (DIRECTION > 0) {
+					// Shift the window by one entry towards higher indices; the entry
+					// that falls out is handed to the next lane, and the first lane of
+					// the subgroup loads the new right pixel from memory.
+					const CENSUS_TYPE t = right_buffer[j][DP_BLOCK_SIZE - 1];
+					for (unsigned int k = DP_BLOCK_SIZE - 1; k > 0; --k) {
+						right_buffer[j][k] = right_buffer[j][k - 1];
+					}
+					right_buffer[j][0] = SHFL_UP(shfl_mask, t, 1, SUBGROUP_SIZE);
+					if (lane_id == 0) {
+						right_buffer[j][0] = load_census_with_check(&right[j * feature_step], x - min_disp, width);
+					}
+				}
+				else {
+					// Mirror image of the branch above: shift towards lower indices,
+					// take the new last entry from the next lane, and let the last
+					// lane of the subgroup load from memory.
+					const CENSUS_TYPE t = right_buffer[j][0];
+					for (unsigned int k = 1; k < DP_BLOCK_SIZE; ++k) {
+						right_buffer[j][k - 1] = right_buffer[j][k];
+					}
+					right_buffer[j][DP_BLOCK_SIZE - 1] = SHFL_DOWN(shfl_mask, t, 1, SUBGROUP_SIZE);
+					if (lane_id + 1 == SUBGROUP_SIZE) {
+						right_buffer[j][DP_BLOCK_SIZE - 1] = load_census_with_check(&right[j * feature_step], x - (min_disp + dp_offset + DP_BLOCK_SIZE - 1), width);
+					}
+				}
+				// Matching cost = Hamming distance between the census descriptors.
+				uint32_t local_costs[DP_BLOCK_SIZE];
+				for (unsigned int k = 0; k < DP_BLOCK_SIZE; ++k) {
+					local_costs[k] = popcnt(left_value ^ right_buffer[j][k]);
+				}
+				dp[j].update(local_costs, p1, p2, shfl_mask);
+				store_uint8_vector<DP_BLOCK_SIZE>(
+					&dest[j * dest_step + x * MAX_DISPARITY + dp_offset],
+					dp[j].dp);
+			}
+		}
+		x0 += static_cast<int>(DP_BLOCK_SIZE) * DIRECTION;
+	}
+}
+
+
+// Launches the horizontal aggregation kernel in left-to-right direction on
+// `stream`, using enough blocks to give every image row its own path.
+template <typename CENSUS_TYPE, unsigned int MAX_DISPARITY>
+void aggregate_left2right(
+	COST_TYPE *dest,
+	const CENSUS_TYPE *left,
+	const CENSUS_TYPE *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	int min_disp,
+	cudaStream_t stream)
+{
+	constexpr unsigned int threads_per_path = MAX_DISPARITY / DP_BLOCK_SIZE;
+	constexpr unsigned int paths_per_block =
+		BLOCK_SIZE * DP_BLOCKS_PER_THREAD / threads_per_path;
+
+	// ceil(height / paths_per_block)
+	const int num_blocks = (height + paths_per_block - 1) / paths_per_block;
+	const int num_threads = BLOCK_SIZE;
+
+	aggregate_horizontal_path_kernel<CENSUS_TYPE, 1, MAX_DISPARITY>
+		<<<num_blocks, num_threads, 0, stream>>>(
+			dest, left, right, width, height, p1, p2, min_disp);
+	CUDA_CHECK(cudaGetLastError());
+}
+
+// Launches the horizontal aggregation kernel in right-to-left direction on
+// `stream`, using enough blocks to give every image row its own path.
+template <typename CENSUS_TYPE, unsigned int MAX_DISPARITY>
+void aggregate_right2left(
+	COST_TYPE *dest,
+	const CENSUS_TYPE *left,
+	const CENSUS_TYPE *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	int min_disp,
+	cudaStream_t stream)
+{
+	constexpr unsigned int threads_per_path = MAX_DISPARITY / DP_BLOCK_SIZE;
+	constexpr unsigned int paths_per_block =
+		BLOCK_SIZE * DP_BLOCKS_PER_THREAD / threads_per_path;
+
+	// ceil(height / paths_per_block)
+	const int num_blocks = (height + paths_per_block - 1) / paths_per_block;
+	const int num_threads = BLOCK_SIZE;
+
+	aggregate_horizontal_path_kernel<CENSUS_TYPE, -1, MAX_DISPARITY>
+		<<<num_blocks, num_threads, 0, stream>>>(
+			dest, left, right, width, height, p1, p2, min_disp);
+	CUDA_CHECK(cudaGetLastError());
+}
+
+} // namespace horizontal
+
+namespace oblique
+{
+
+// Disparities handled per thread, and threads per block, for the oblique paths.
+static constexpr unsigned int DP_BLOCK_SIZE = 16u;
+static constexpr unsigned int BLOCK_SIZE = WARP_SIZE * 8u;
+
+// Aggregates matching costs along one diagonal scan direction.
+//
+// X_DIRECTION / Y_DIRECTION (each +1 or -1, enforced by static_assert) select
+// which diagonal is walked. Every group of SUBGROUP_SIZE threads follows one
+// path: on iteration `iter` it visits row y and column
+// x = x0 + iter * X_DIRECTION, and each thread of the group handles
+// DP_BLOCK_SIZE consecutive disparities starting at dp_offset.
+//
+// The per-disparity cost is popcnt(left census XOR right census); it is passed
+// to DynamicProgramming::update() together with p1/p2, and the resulting dp
+// values are stored to dest[(y * width + x) * MAX_DISPARITY + disparity].
+//
+// Launch expectations visible from the code: only the .x components of the
+// block/thread indices are read, and the cooperative load below strides by
+// BLOCK_SIZE, so blockDim.x is expected to equal BLOCK_SIZE.
+// NOTE(review): left/right are indexed as dense rows of `width` elements —
+// confirm callers never pass pitched census images.
+template <typename CENSUS_TYPE, int X_DIRECTION, int Y_DIRECTION, unsigned int MAX_DISPARITY>
+__global__ void aggregate_oblique_path_kernel(
+	uint8_t *dest,
+	const CENSUS_TYPE *left,
+	const CENSUS_TYPE *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	int min_disp)
+{
+	// Threads cooperating on one path, and how many paths fit in a warp / block.
+	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
+	static const unsigned int PATHS_PER_WARP = WARP_SIZE / SUBGROUP_SIZE;
+	static const unsigned int PATHS_PER_BLOCK = BLOCK_SIZE / SUBGROUP_SIZE;
+
+	// The block shares one window of right-image census values per row: it spans
+	// MAX_DISPARITY candidates plus one extra column for each path in the block.
+	static const unsigned int RIGHT_BUFFER_SIZE = MAX_DISPARITY + PATHS_PER_BLOCK;
+	static const unsigned int RIGHT_BUFFER_ROWS = RIGHT_BUFFER_SIZE / DP_BLOCK_SIZE;
+
+	static_assert(X_DIRECTION == 1 || X_DIRECTION == -1, "");
+	static_assert(Y_DIRECTION == 1 || Y_DIRECTION == -1, "");
+	// Uniform early exit for empty images (taken by every thread, so it cannot
+	// unbalance the __syncthreads() calls below).
+	if (width == 0 || height == 0) {
+		return;
+	}
+
+	// Linear window index i = hi * DP_BLOCK_SIZE + lo is stored at [lo][hi] and,
+	// when hi > 0, duplicated at [lo + DP_BLOCK_SIZE][hi - 1]. The duplicate lets
+	// a thread read DP_BLOCK_SIZE consecutive entries from a single column
+	// ([lo0 + j][hi0]) without wrapping into the next column.
+	__shared__ CENSUS_TYPE right_buffer[2 * DP_BLOCK_SIZE][RIGHT_BUFFER_ROWS];
+	DynamicProgramming<DP_BLOCK_SIZE, SUBGROUP_SIZE> dp;
+
+	const unsigned int warp_id = threadIdx.x / WARP_SIZE;
+	const unsigned int group_id = threadIdx.x % WARP_SIZE / SUBGROUP_SIZE;
+	const unsigned int lane_id = threadIdx.x % SUBGROUP_SIZE;
+	// Shuffle mask restricted to the lanes of this thread's subgroup.
+	const unsigned int shfl_mask =
+		generate_mask<SUBGROUP_SIZE>() << (group_id * SUBGROUP_SIZE);
+
+	// Column of this path at iter == 0. For X_DIRECTION > 0 the start is shifted
+	// left by (height - 1), so x0 may be negative; such positions are skipped by
+	// the range check in the compute step.
+	const int x0 =
+		blockIdx.x * PATHS_PER_BLOCK +
+		warp_id * PATHS_PER_WARP +
+		group_id +
+		(X_DIRECTION > 0 ? -static_cast<int>(height - 1) : 0);
+	// Same quantity for the first path of the block; anchors the shared window.
+	const int right_x00 =
+		blockIdx.x * PATHS_PER_BLOCK +
+		(X_DIRECTION > 0 ? -static_cast<int>(height - 1) : 0);
+	// First disparity (relative to min_disp) handled by this thread.
+	const unsigned int dp_offset = lane_id * DP_BLOCK_SIZE;
+
+	// Position inside the shared window of this thread's first disparity; it is
+	// independent of iter because x and right_x0 advance together.
+	const unsigned int right0_addr =
+		static_cast<unsigned int>(right_x00 + PATHS_PER_BLOCK - 1 - x0) + dp_offset;
+	const unsigned int right0_addr_lo = right0_addr % DP_BLOCK_SIZE;
+	const unsigned int right0_addr_hi = right0_addr / DP_BLOCK_SIZE;
+
+	for (unsigned int iter = 0; iter < height; ++iter) {
+		// Row visited this iteration: top-down for Y_DIRECTION > 0, bottom-up otherwise.
+		const int y = static_cast<int>(Y_DIRECTION > 0 ? iter : height - 1 - iter);
+		const int x = x0 + static_cast<int>(iter) * X_DIRECTION;
+		const int right_x0 = right_x00 + static_cast<int>(iter) * X_DIRECTION;
+		// Cooperative load of the right-image window for row y into shared memory.
+		for (unsigned int i0 = 0; i0 < RIGHT_BUFFER_SIZE; i0 += BLOCK_SIZE) {
+			const unsigned int i = i0 + threadIdx.x;
+			if (i < RIGHT_BUFFER_SIZE) {
+				// Window slot i maps to a right-image column that decreases with i.
+				// load_census_with_check is defined outside this view; presumably it
+				// handles columns outside [0, width) — TODO confirm.
+				const int right_x = static_cast<int>(right_x0 + PATHS_PER_BLOCK - 1 - i - min_disp);
+				const CENSUS_TYPE right_value = load_census_with_check(&right[y * width], right_x, width);
+				const unsigned int lo = i % DP_BLOCK_SIZE;
+				const unsigned int hi = i / DP_BLOCK_SIZE;
+				right_buffer[lo][hi] = right_value;
+				if (hi > 0) {
+					right_buffer[lo + DP_BLOCK_SIZE][hi - 1] = right_value;
+				}
+			}
+		}
+		// Make the freshly loaded window visible to every thread of the block.
+		__syncthreads();
+		// Compute: only paths whose current column lies inside the image take part.
+		if (0 <= x && x < static_cast<int>(width)) {
+			const CENSUS_TYPE left_value = __ldg(&left[x + y * width]);
+			CENSUS_TYPE right_values[DP_BLOCK_SIZE];
+			for (unsigned int j = 0; j < DP_BLOCK_SIZE; ++j) {
+				right_values[j] = right_buffer[right0_addr_lo + j][right0_addr_hi];
+			}
+			// Matching cost per disparity: number of differing census bits.
+			uint32_t local_costs[DP_BLOCK_SIZE];
+			for (unsigned int j = 0; j < DP_BLOCK_SIZE; ++j) {
+				local_costs[j] = popcnt(left_value ^ right_values[j]);
+			}
+			dp.update(local_costs, p1, p2, shfl_mask);
+			store_uint8_vector<DP_BLOCK_SIZE>(
+				&dest[dp_offset + x * MAX_DISPARITY + y * MAX_DISPARITY * width],
+				dp.dp);
+		}
+		// Keep the next iteration's load from overwriting the window while other
+		// threads are still reading it.
+		__syncthreads();
+	}
+}
+
+
+// Host-side launcher for the diagonal path with X_DIRECTION = 1, Y_DIRECTION = 1.
+//
+// Launches aggregate_oblique_path_kernel on `stream` with
+// ceil((width + height - 1) / PATHS_PER_BLOCK) blocks of BLOCK_SIZE threads and
+// reports a failed launch through CUDA_CHECK. The caller is responsible for
+// synchronizing `stream` before reading `dest`.
+//
+// All other arguments are forwarded to the kernel unchanged.
+// Empty images (width <= 0 or height <= 0) are a no-op.
+template <typename CENSUS_TYPE, unsigned int MAX_DISPARITY>
+void aggregate_upleft2downright(
+	COST_TYPE *dest,
+	const CENSUS_TYPE *left,
+	const CENSUS_TYPE *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	int min_disp,
+	cudaStream_t stream)
+{
+	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
+	static const unsigned int PATHS_PER_BLOCK = BLOCK_SIZE / SUBGROUP_SIZE;
+
+	// Degenerate sizes can make the grid dimension 0, which is not a valid
+	// launch configuration; the kernel would do nothing for them anyway.
+	if (width <= 0 || height <= 0) {
+		return;
+	}
+
+	const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK;
+	const int bdim = BLOCK_SIZE;
+	aggregate_oblique_path_kernel<CENSUS_TYPE, 1, 1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
+		dest, left, right, width, height, p1, p2, min_disp);
+	CUDA_CHECK(cudaGetLastError());
+}
+
+// Host-side launcher for the diagonal path with X_DIRECTION = -1, Y_DIRECTION = 1.
+//
+// Launches aggregate_oblique_path_kernel on `stream` with
+// ceil((width + height - 1) / PATHS_PER_BLOCK) blocks of BLOCK_SIZE threads and
+// reports a failed launch through CUDA_CHECK. The caller is responsible for
+// synchronizing `stream` before reading `dest`.
+//
+// All other arguments are forwarded to the kernel unchanged.
+// Empty images (width <= 0 or height <= 0) are a no-op.
+template <typename CENSUS_TYPE, unsigned int MAX_DISPARITY>
+void aggregate_upright2downleft(
+	COST_TYPE *dest,
+	const CENSUS_TYPE *left,
+	const CENSUS_TYPE *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	int min_disp,
+	cudaStream_t stream)
+{
+	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
+	static const unsigned int PATHS_PER_BLOCK = BLOCK_SIZE / SUBGROUP_SIZE;
+
+	// Degenerate sizes can make the grid dimension 0, which is not a valid
+	// launch configuration; the kernel would do nothing for them anyway.
+	if (width <= 0 || height <= 0) {
+		return;
+	}
+
+	const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK;
+	const int bdim = BLOCK_SIZE;
+	aggregate_oblique_path_kernel<CENSUS_TYPE, -1, 1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
+		dest, left, right, width, height, p1, p2, min_disp);
+	CUDA_CHECK(cudaGetLastError());
+}
+
+// Host-side launcher for the diagonal path with X_DIRECTION = -1, Y_DIRECTION = -1.
+//
+// Launches aggregate_oblique_path_kernel on `stream` with
+// ceil((width + height - 1) / PATHS_PER_BLOCK) blocks of BLOCK_SIZE threads and
+// reports a failed launch through CUDA_CHECK. The caller is responsible for
+// synchronizing `stream` before reading `dest`.
+//
+// All other arguments are forwarded to the kernel unchanged.
+// Empty images (width <= 0 or height <= 0) are a no-op.
+template <typename CENSUS_TYPE, unsigned int MAX_DISPARITY>
+void aggregate_downright2upleft(
+	COST_TYPE *dest,
+	const CENSUS_TYPE *left,
+	const CENSUS_TYPE *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	int min_disp,
+	cudaStream_t stream)
+{
+	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
+	static const unsigned int PATHS_PER_BLOCK = BLOCK_SIZE / SUBGROUP_SIZE;
+
+	// Degenerate sizes can make the grid dimension 0, which is not a valid
+	// launch configuration; the kernel would do nothing for them anyway.
+	if (width <= 0 || height <= 0) {
+		return;
+	}
+
+	const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK;
+	const int bdim = BLOCK_SIZE;
+	aggregate_oblique_path_kernel<CENSUS_TYPE, -1, -1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
+		dest, left, right, width, height, p1, p2, min_disp);
+	CUDA_CHECK(cudaGetLastError());
+}
+
+// Host-side launcher for the diagonal path with X_DIRECTION = 1, Y_DIRECTION = -1.
+//
+// Launches aggregate_oblique_path_kernel on `stream` with
+// ceil((width + height - 1) / PATHS_PER_BLOCK) blocks of BLOCK_SIZE threads and
+// reports a failed launch through CUDA_CHECK. The caller is responsible for
+// synchronizing `stream` before reading `dest`.
+//
+// All other arguments are forwarded to the kernel unchanged.
+// Empty images (width <= 0 or height <= 0) are a no-op.
+template <typename CENSUS_TYPE, unsigned int MAX_DISPARITY>
+void aggregate_downleft2upright(
+	COST_TYPE *dest,
+	const CENSUS_TYPE *left,
+	const CENSUS_TYPE *right,
+	int width,
+	int height,
+	unsigned int p1,
+	unsigned int p2,
+	int min_disp,
+	cudaStream_t stream)
+{
+	static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
+	static const unsigned int PATHS_PER_BLOCK = BLOCK_SIZE / SUBGROUP_SIZE;
+
+	// Degenerate sizes can make the grid dimension 0, which is not a valid
+	// launch configuration; the kernel would do nothing for them anyway.
+	if (width <= 0 || height <= 0) {
+		return;
+	}
+
+	const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK;
+	const int bdim = BLOCK_SIZE;
+	aggregate_oblique_path_kernel<CENSUS_TYPE, 1, -1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
+		dest, left, right, width, height, p1, p2, min_disp);
+	CUDA_CHECK(cudaGetLastError());
+}
+
+} // namespace oblique
+
+} // namespace cost_aggregation
+
+namespace details
+{
+
+// Runs cost aggregation for every scan path selected by `path_type` and stores
+// the per-path cost volumes in `dst`.
+//
+// dst is (re)created as num_paths rows of height * width * MAX_DISPARITY
+// 8-bit costs; row i receives the output of path i (0: up->down, 1: down->up,
+// 2: left->right, 3: right->left, 4-7: the diagonals, 8-path mode only).
+// Each path is launched on its own CUDA stream, and the function returns only
+// after all of those streams have been synchronized.
+//
+// CENSUS_TYPE is the element type read from srcL/srcR; P1, P2 and min_disp are
+// forwarded to the path launchers. CUDA failures are reported via CUDA_CHECK.
+template <typename CENSUS_TYPE, int MAX_DISPARITY>
+void cost_aggregation_(const DeviceImage& srcL, const DeviceImage& srcR, DeviceImage& dst,
+	int P1, int P2, PathType path_type, int min_disp)
+{
+	const int width = srcL.cols;
+	const int height = srcL.rows;
+	const int num_paths = path_type == PathType::SCAN_4PATH ? 4 : 8;
+
+	dst.create(num_paths, height * width * MAX_DISPARITY, SGM_8U);
+
+	const CENSUS_TYPE* left = srcL.ptr<CENSUS_TYPE>();
+	const CENSUS_TYPE* right = srcR.ptr<CENSUS_TYPE>();
+
+	// Zero-initialized so that a stream whose creation failed falls back to the
+	// default (null) stream instead of using an indeterminate handle.
+	cudaStream_t streams[8] = {};
+	for (int i = 0; i < num_paths; i++) {
+		// The status is captured first so the call is evaluated exactly once.
+		const cudaError_t status = cudaStreamCreate(&streams[i]);
+		CUDA_CHECK(status);
+		if (status != cudaSuccess)
+			streams[i] = nullptr;
+	}
+
+	cost_aggregation::vertical::aggregate_up2down<CENSUS_TYPE, MAX_DISPARITY>(
+		dst.ptr<COST_TYPE>(0), left, right, width, height, P1, P2, min_disp, streams[0]);
+	cost_aggregation::vertical::aggregate_down2up<CENSUS_TYPE, MAX_DISPARITY>(
+		dst.ptr<COST_TYPE>(1), left, right, width, height, P1, P2, min_disp, streams[1]);
+	cost_aggregation::horizontal::aggregate_left2right<CENSUS_TYPE, MAX_DISPARITY>(
+		dst.ptr<COST_TYPE>(2), left, right, width, height, P1, P2, min_disp, streams[2]);
+	cost_aggregation::horizontal::aggregate_right2left<CENSUS_TYPE, MAX_DISPARITY>(
+		dst.ptr<COST_TYPE>(3), left, right, width, height, P1, P2, min_disp, streams[3]);
+
+	if (path_type == PathType::SCAN_8PATH) {
+		cost_aggregation::oblique::aggregate_upleft2downright<CENSUS_TYPE, MAX_DISPARITY>(
+			dst.ptr<COST_TYPE>(4), left, right, width, height, P1, P2, min_disp, streams[4]);
+		cost_aggregation::oblique::aggregate_upright2downleft<CENSUS_TYPE, MAX_DISPARITY>(
+			dst.ptr<COST_TYPE>(5), left, right, width, height, P1, P2, min_disp, streams[5]);
+		cost_aggregation::oblique::aggregate_downright2upleft<CENSUS_TYPE, MAX_DISPARITY>(
+			dst.ptr<COST_TYPE>(6), left, right, width, height, P1, P2, min_disp, streams[6]);
+		cost_aggregation::oblique::aggregate_downleft2upright<CENSUS_TYPE, MAX_DISPARITY>(
+			dst.ptr<COST_TYPE>(7), left, right, width, height, P1, P2, min_disp, streams[7]);
+	}
+
+	// Synchronizing is also where failures of the asynchronous kernels surface,
+	// so the return value must not be dropped.
+	for (int i = 0; i < num_paths; i++) {
+		const cudaError_t status = cudaStreamSynchronize(streams[i]);
+		CUDA_CHECK(status);
+	}
+	for (int i = 0; i < num_paths; i++) {
+		if (streams[i] == nullptr)
+			continue; // fell back to the default stream: nothing to destroy
+		const cudaError_t status = cudaStreamDestroy(streams[i]);
+		CUDA_CHECK(status);
+	}
+}
+
+// Entry point for cost aggregation: picks the cost_aggregation_ instantiation
+// matching the census element type (SGM_32U or SGM_64U) and the disparity
+// range (64, 128 or 256).
+//
+// Throws std::logic_error (via SGM_ASSERT) when the two images differ in type.
+// Any other type / disp_size combination leaves `dst` untouched.
+void cost_aggregation(const DeviceImage& srcL, const DeviceImage& srcR, DeviceImage& dst,
+	int disp_size, int P1, int P2, PathType path_type, int min_disp)
+{
+	SGM_ASSERT(srcL.type == srcR.type, "left and right image type must be same.");
+
+	const bool census32 = srcL.type == SGM_32U;
+	const bool census64 = srcL.type == SGM_64U;
+
+	switch (disp_size) {
+	case 64:
+		if (census32)
+			cost_aggregation_<uint32_t, 64>(srcL, srcR, dst, P1, P2, path_type, min_disp);
+		else if (census64)
+			cost_aggregation_<uint64_t, 64>(srcL, srcR, dst, P1, P2, path_type, min_disp);
+		break;
+	case 128:
+		if (census32)
+			cost_aggregation_<uint32_t, 128>(srcL, srcR, dst, P1, P2, path_type, min_disp);
+		else if (census64)
+			cost_aggregation_<uint64_t, 128>(srcL, srcR, dst, P1, P2, path_type, min_disp);
+		break;
+	case 256:
+		if (census32)
+			cost_aggregation_<uint32_t, 256>(srcL, srcR, dst, P1, P2, path_type, min_disp);
+		else if (census64)
+			cost_aggregation_<uint64_t, 256>(srcL, srcR, dst, P1, P2, path_type, min_disp);
+		break;
+	default:
+		// Unsupported disparity range: nothing is computed.
+		break;
+	}
+}
+
+} // namespace details
+} // namespace sgm
@@ -0,0 +1,76 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include "internal.h"
+
+#include <cuda_runtime.h>
+
+#include "host_utility.h"
+
+namespace
+{
+
+// Element-wise narrowing copy: arr8bits[k] = (uint8_t)arr16bits[k] for
+// k in [0, num_elements). One thread per element; surplus threads exit.
+__global__ void cast_16bit_8bit_array_kernel(const uint16_t* arr16bits, uint8_t* arr8bits, int num_elements)
+{
+	const int idx = threadIdx.x + blockDim.x * blockIdx.x;
+	if (idx >= num_elements)
+		return;
+
+	arr8bits[idx] = static_cast<uint8_t>(arr16bits[idx]);
+}
+
+// Element-wise widening copy: arr16bits[k] = (uint16_t)arr8bits[k] for
+// k in [0, num_elements). One thread per element; surplus threads exit.
+__global__ void cast_8bit_16bit_array_kernel(const uint8_t* arr8bits, uint16_t* arr16bits, int num_elements)
+{
+	const int idx = threadIdx.x + blockDim.x * blockIdx.x;
+	if (idx >= num_elements)
+		return;
+
+	arr16bits[idx] = static_cast<uint16_t>(arr8bits[idx]);
+}
+
+} // namespace
+
+namespace sgm
+{
+namespace details
+{
+
+// Converts a 16-bit image to 8 bits per element by truncating each value.
+// `dst` is (re)created with the geometry of `src` (rows, cols, step) and type
+// SGM_8U; every element of the rows * step buffer is converted, padding included.
+void cast_16bit_to_8bit(const DeviceImage& src, DeviceImage& dst)
+{
+	const int height = src.rows;
+	const int width = src.cols;
+	dst.create(height, width, SGM_8U, src.step);
+
+	const int threads_per_block = 1024;
+	const int num_elements = height * src.step;
+	const int num_blocks = divUp(num_elements, threads_per_block);
+
+	cast_16bit_8bit_array_kernel<<<num_blocks, threads_per_block>>>(src.ptr<uint16_t>(), dst.ptr<uint8_t>(), num_elements);
+	CUDA_CHECK(cudaGetLastError());
+}
+
+// Converts an 8-bit image to 16 bits per element by widening each value.
+// `dst` is (re)created with the geometry of `src` (rows, cols, step) and type
+// SGM_16U; every element of the rows * step buffer is converted, padding included.
+void cast_8bit_to_16bit(const DeviceImage& src, DeviceImage& dst)
+{
+	const int height = src.rows;
+	const int width = src.cols;
+	dst.create(height, width, SGM_16U, src.step);
+
+	const int threads_per_block = 1024;
+	const int num_elements = height * src.step;
+	const int num_blocks = divUp(num_elements, threads_per_block);
+
+	cast_8bit_16bit_array_kernel<<<num_blocks, threads_per_block>>>(src.ptr<uint8_t>(), dst.ptr<uint16_t>(), num_elements);
+	CUDA_CHECK(cudaGetLastError());
+}
+
+} // namespace details
+} // namespace sgm
@@ -0,0 +1,110 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include "device_allocator.h"
+
+#include <cuda_runtime.h>
+
+#include "host_utility.h"
+
+namespace sgm
+{
+
+// Creates an allocator that holds no buffer and no reference count.
+DeviceAllocator::DeviceAllocator()
+	: data_(nullptr)
+	, ref_count_(nullptr)
+	, capacity_(0)
+{
+}
+
+// Copy constructor: shares the buffer of `other` and, when that buffer is
+// reference counted, takes one more reference to it.
+DeviceAllocator::DeviceAllocator(const DeviceAllocator& other)
+	: data_(other.data_)
+	, ref_count_(other.ref_count_)
+	, capacity_(other.capacity_)
+{
+	if (ref_count_)
+		(*ref_count_)++;
+}
+
+// Move constructor: takes over the buffer and reference count of `right`,
+// leaving `right` empty. The reference count itself is not changed.
+DeviceAllocator::DeviceAllocator(DeviceAllocator&& right)
+	: data_(right.data_)
+	, ref_count_(right.ref_count_)
+	, capacity_(right.capacity_)
+{
+	right.data_ = nullptr;
+	right.ref_count_ = nullptr;
+	right.capacity_ = 0;
+}
+
+// Drops this object's reference; the buffer is freed by release() only when
+// this was the last reference to a counted buffer.
+DeviceAllocator::~DeviceAllocator()
+{
+	release();
+}
+
+// Returns a device buffer of at least `size` bytes.
+//
+// The current buffer is reused when it is already large enough. Otherwise the
+// current reference is released and a fresh buffer is obtained with cudaMalloc,
+// owned through a new reference count of 1.
+//
+// If cudaMalloc fails, the error is reported through CUDA_CHECK, the allocator
+// is left empty (capacity 0) and nullptr is returned, so a later call retries
+// the allocation instead of handing out a null buffer as if it were valid.
+void* DeviceAllocator::allocate(size_t size)
+{
+	if (size <= capacity_)
+		return data_;
+
+	release();
+
+	// The status is captured first so the call is evaluated exactly once.
+	void* fresh = nullptr;
+	const cudaError_t status = cudaMalloc(&fresh, size);
+	CUDA_CHECK(status);
+	if (status != cudaSuccess)
+		return nullptr;
+
+	data_ = fresh;
+	ref_count_ = new int(1);
+	capacity_ = size;
+	return data_;
+}
+
+// Wraps an externally provided buffer of `size` bytes.
+// The previous reference is released first. No reference count is created for
+// the new buffer (release() leaves ref_count_ null), so this object never
+// frees `data`.
+void DeviceAllocator::assign(void* data, size_t size)
+{
+	release();
+	data_ = data;
+	capacity_ = size;
+}
+
+// Drops this object's reference to its buffer and resets it to the empty state.
+// When a reference count exists and this was the last reference, the device
+// memory is freed with cudaFree and the counter is deleted. Buffers without a
+// reference count are simply forgotten.
+void DeviceAllocator::release()
+{
+	if (ref_count_)
+	{
+		--(*ref_count_);
+		if (*ref_count_ == 0)
+		{
+			CUDA_CHECK(cudaFree(data_));
+			delete ref_count_;
+		}
+	}
+
+	ref_count_ = nullptr;
+	data_ = nullptr;
+	capacity_ = 0;
+}
+
+// Copy assignment: releases the current reference and then shares the buffer
+// of `other`, taking one more reference when that buffer is counted.
+//
+// Self-assignment is a no-op. Without the guard, release() would drop (and
+// possibly free) the buffer before it is copied back from the emptied object.
+DeviceAllocator& DeviceAllocator::operator=(const DeviceAllocator& other)
+{
+	if (this != &other)
+	{
+		release();
+		copy_construct_from(other);
+	}
+	return *this;
+}
+
+// Move assignment: releases the current reference and takes over the buffer
+// and reference count of `right`, leaving `right` empty.
+//
+// Self-move is a no-op. Without the guard, release() would drop (and possibly
+// free) the buffer before it is moved back from the emptied object.
+DeviceAllocator& DeviceAllocator::operator=(DeviceAllocator&& right)
+{
+	if (this != &right)
+	{
+		release();
+		move_construct_from(std::move(right));
+	}
+	return *this;
+}
+
+// Overwrites this object's state with that of `other` and takes one more
+// reference when `other` has a reference count. The previous state is NOT
+// released here; callers do that beforehand when needed.
+void DeviceAllocator::copy_construct_from(const DeviceAllocator& other)
+{
+	capacity_ = other.capacity_;
+	data_ = other.data_;
+	ref_count_ = other.ref_count_;
+
+	if (ref_count_ != nullptr)
+	{
+		++(*ref_count_);
+	}
+}
+
+// Overwrites this object's state with that of `right` and empties `right`.
+// The reference count is transferred unchanged. The previous state is NOT
+// released here; callers do that beforehand when needed.
+void DeviceAllocator::move_construct_from(DeviceAllocator&& right)
+{
+	capacity_ = right.capacity_;
+	data_ = right.data_;
+	ref_count_ = right.ref_count_;
+
+	right.capacity_ = 0;
+	right.ref_count_ = nullptr;
+	right.data_ = nullptr;
+}
+
+} // namespace sgm
@@ -0,0 +1,52 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#ifndef __DEVICE_ALLOCATOR_H__
+#define __DEVICE_ALLOCATOR_H__
+
+#include <cstddef>
+
+namespace sgm
+{
+
+// Reference-counted holder of a CUDA device buffer.
+//
+// Copies share the same buffer and reference count; the buffer is freed when
+// the last counted reference is released. Buffers supplied through assign()
+// carry no reference count and are never freed by this class.
+// The counter is a plain int; no synchronization is visible in this class.
+class DeviceAllocator
+{
+public:
+
+	// Creates an empty allocator (no buffer).
+	DeviceAllocator();
+	// Shares the buffer of `other`.
+	DeviceAllocator(const DeviceAllocator& other);
+	// Takes over the buffer of `right`, leaving it empty.
+	DeviceAllocator(DeviceAllocator&& right);
+	// Releases this object's reference.
+	~DeviceAllocator();
+	// Returns a buffer of at least `size` bytes, reusing the current one when
+	// its capacity suffices and allocating a new one otherwise.
+	void* allocate(size_t size);
+	// Wraps an external buffer of `size` bytes without taking ownership.
+	void assign(void* data, size_t size);
+	// Drops the current reference and resets to the empty state.
+	void release();
+
+	DeviceAllocator& operator=(const DeviceAllocator& other);
+	DeviceAllocator& operator=(DeviceAllocator&& right);
+
+private:
+
+	// State-transfer helpers shared by the constructors and assignment operators;
+	// neither releases the state being overwritten.
+	void copy_construct_from(const DeviceAllocator& other);
+	void move_construct_from(DeviceAllocator&& right);
+
+	// Device buffer (owned when ref_count_ is non-null, external otherwise).
+	void* data_;
+	// Heap-allocated count shared between copies; nullptr for empty or external buffers.
+	int* ref_count_;
+	// Size in bytes recorded for data_.
+	size_t capacity_;
+};
+
+} // namespace sgm
+
+#endif // !__DEVICE_ALLOCATOR_H__
@@ -0,0 +1,93 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include "device_image.h"
+
+#include <cuda_runtime.h>
+
+#include "host_utility.h"
+
+namespace sgm
+{
+
+// Size in bytes of one element of the given image type; 0 for unknown values.
+static size_t elemSize(ImageType type)
+{
+	switch (type)
+	{
+	case SGM_8U:
+		return 1;
+	case SGM_16U:
+		return 2;
+	case SGM_32U:
+		return 4;
+	case SGM_64U:
+		return 8;
+	default:
+		return 0;
+	}
+}
+
+// Creates an empty 0x0 image of type SGM_8U with no data.
+DeviceImage::DeviceImage()
+	: data(nullptr)
+	, rows(0)
+	, cols(0)
+	, step(0)
+	, type(SGM_8U)
+{
+}
+
+// Creates an image backed by memory obtained from the internal allocator.
+// All members are set by create(); see create(int, int, ImageType, int).
+DeviceImage::DeviceImage(int rows, int cols, ImageType type, int step)
+{
+	create(rows, cols, type, step);
+}
+
+// Creates an image that wraps the caller-provided buffer `data`.
+// All members are set by create(); see create(void*, int, int, ImageType, int).
+DeviceImage::DeviceImage(void* data, int rows, int cols, ImageType type, int step)
+{
+	create(data, rows, cols, type, step);
+}
+
+// (Re)shapes the image and obtains elemSize(_type) * _rows * step bytes from
+// the internal allocator. A negative _step means "tightly packed": the step
+// becomes _cols. The step is counted in elements, not bytes.
+void DeviceImage::create(int _rows, int _cols, ImageType _type, int _step)
+{
+	const int effective_step = _step < 0 ? _cols : _step;
+	const size_t num_bytes = elemSize(_type) * _rows * effective_step;
+
+	data = allocator_.allocate(num_bytes);
+	type = _type;
+	step = effective_step;
+	cols = _cols;
+	rows = _rows;
+}
+
+// (Re)shapes the image around the caller-provided buffer `_data`, which is
+// registered with the allocator as an external buffer of
+// elemSize(_type) * _rows * step bytes. A negative _step means "tightly
+// packed": the step becomes _cols. The step is counted in elements, not bytes.
+void DeviceImage::create(void* _data, int _rows, int _cols, ImageType _type, int _step)
+{
+	const int effective_step = _step < 0 ? _cols : _step;
+	const size_t num_bytes = elemSize(_type) * _rows * effective_step;
+
+	allocator_.assign(_data, num_bytes);
+	data = _data;
+	type = _type;
+	step = effective_step;
+	cols = _cols;
+	rows = _rows;
+}
+
+// Copies the whole image (elemSize(type) * rows * step bytes) from the host
+// buffer `_data` to the device. Failures are reported through CUDA_CHECK.
+void DeviceImage::upload(const void* _data)
+{
+	const size_t num_bytes = elemSize(type) * rows * step;
+	CUDA_CHECK(cudaMemcpy(data, _data, num_bytes, cudaMemcpyHostToDevice));
+}
+
+// Copies the whole image (elemSize(type) * rows * step bytes) from the device
+// to the host buffer `_data`. Failures are reported through CUDA_CHECK.
+void DeviceImage::download(void* _data) const
+{
+	const size_t num_bytes = elemSize(type) * rows * step;
+	CUDA_CHECK(cudaMemcpy(_data, data, num_bytes, cudaMemcpyDeviceToHost));
+}
+
+// Sets every byte of the image buffer (elemSize(type) * rows * step bytes) to
+// zero. Failures are reported through CUDA_CHECK.
+void DeviceImage::fill_zero()
+{
+	const size_t num_bytes = elemSize(type) * rows * step;
+	CUDA_CHECK(cudaMemset(data, 0, num_bytes));
+}
+
+} // namespace sgm
@@ -0,0 +1,62 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#ifndef __DEVICE_IMAGE_H__
+#define __DEVICE_IMAGE_H__
+
+#include "device_allocator.h"
+
+namespace sgm
+{
+
+// Element type of a DeviceImage; the name encodes the element width in bits.
+enum ImageType
+{
+	SGM_8U,		// 1 byte per element
+	SGM_16U,	// 2 bytes per element
+	SGM_32U,	// 4 bytes per element
+	SGM_64U,	// 8 bytes per element
+};
+
+// 2D image stored in a device buffer.
+//
+// The buffer is either obtained from the internal DeviceAllocator or supplied
+// by the caller (wrapped, not owned). `step` is the row stride counted in
+// elements, not bytes; passing a negative step to the constructors or create()
+// selects step == cols.
+class DeviceImage
+{
+public:
+
+	// Empty image.
+	DeviceImage();
+	// Image backed by allocator memory.
+	DeviceImage(int rows, int cols, ImageType type, int step = -1);
+	// Image wrapping the caller-provided buffer `data`.
+	DeviceImage(void* data, int rows, int cols, ImageType type, int step = -1);
+
+	// (Re)shape using allocator memory; an existing buffer is reused when large enough.
+	void create(int rows, int cols, ImageType type, int step = -1);
+	// (Re)shape around the caller-provided buffer `data`.
+	void create(void* data, int rows, int cols, ImageType type, int step = -1);
+
+	// Copy the whole image from host memory to the device.
+	void upload(const void* data);
+	// Copy the whole image from the device to host memory.
+	void download(void* data) const;
+	// Zero the whole image buffer.
+	void fill_zero();
+
+	// Pointer to the first element of row y, viewing the buffer as elements of T.
+	// No check is made that T matches `type`.
+	template <typename T> T* ptr(int y = 0) { return (T*)data + y * (size_t)step; }
+	template <typename T> const T* ptr(int y = 0) const { return (T*)data + y * (size_t)step; }
+
+	// Buffer start.
+	void* data;
+	// Image height, width and row stride (stride in elements).
+	int rows, cols, step;
+	// Element type.
+	ImageType type;
+
+private:
+
+	// Holds the buffer and decides whether it is freed on destruction.
+	DeviceAllocator allocator_;
+};
+
+} // namespace sgm
+
+#endif // !__DEVICE_IMAGE_H__
@@ -0,0 +1,283 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#ifndef __DEVICE_UTILITY_H__
+#define __DEVICE_UTILITY_H__
+
+#include <cuda.h>
+
+#include "types.h"
+#include "constants.h"
+
+namespace sgm
+{
+namespace detail
+{
+
+// Recursive helper for subgroup_min(): one XOR-shuffle step of a min-reduction.
+//
+// Each step combines x with the value held by the lane whose index differs by
+// STEP / 2 (the exchange is confined to segments of GROUP_SIZE lanes through
+// the shuffle width argument), then recurses with STEP halved until the
+// STEP == 1 specialization ends the recursion.
+// `mask` is only used with CUDA 9.0 or newer, where the *_sync shuffle is called.
+// NOTE(review): assumes GROUP_SIZE is a power of two — confirm at call sites.
+template <typename T, unsigned int GROUP_SIZE, unsigned int STEP>
+struct subgroup_min_impl
+{
+	static __device__ T call(T x, uint32_t mask)
+	{
+#if CUDA_VERSION >= 9000
+		x = min(x, __shfl_xor_sync(mask, x, STEP / 2, GROUP_SIZE));
+#else
+		x = min(x, __shfl_xor(x, STEP / 2, GROUP_SIZE));
+#endif
+		return subgroup_min_impl<T, GROUP_SIZE, STEP / 2>::call(x, mask);
+	}
+};
+
+// Recursion terminator: nothing left to combine, return the accumulated value.
+template <typename T, unsigned int GROUP_SIZE>
+struct subgroup_min_impl<T, GROUP_SIZE, 1u>
+{
+	static __device__ T call(T x, uint32_t)
+	{
+		return x;
+	}
+};
+
+// Recursive helper for subgroup_and(): one XOR-shuffle step of an AND-reduction.
+//
+// Each step ANDs x with the value held by the lane whose index differs by
+// STEP / 2 (the exchange is confined to segments of GROUP_SIZE lanes through
+// the shuffle width argument), then recurses with STEP halved until the
+// STEP == 1 specialization ends the recursion.
+// `mask` is only used with CUDA 9.0 or newer, where the *_sync shuffle is called.
+// NOTE(review): assumes GROUP_SIZE is a power of two — confirm at call sites.
+template <unsigned int GROUP_SIZE, unsigned int STEP>
+struct subgroup_and_impl
+{
+	static __device__ bool call(bool x, uint32_t mask)
+	{
+#if CUDA_VERSION >= 9000
+		x &= __shfl_xor_sync(mask, x, STEP / 2, GROUP_SIZE);
+#else
+		x &= __shfl_xor(x, STEP / 2, GROUP_SIZE);
+#endif
+		return subgroup_and_impl<GROUP_SIZE, STEP / 2>::call(x, mask);
+	}
+};
+// Recursion terminator: nothing left to combine, return the accumulated value.
+template <unsigned int GROUP_SIZE>
+struct subgroup_and_impl<GROUP_SIZE, 1u>
+{
+	static __device__ bool call(bool x, uint32_t)
+	{
+		return x;
+	}
+};
+
+} // namespace detail
+
+// Minimum of x across a group of GROUP_SIZE lanes, computed with XOR shuffles.
+// `mask` is the lane mask passed to the *_sync shuffle intrinsics.
+template <unsigned int GROUP_SIZE, typename T>
+__device__ inline T subgroup_min(T x, uint32_t mask)
+{
+	return detail::subgroup_min_impl<T, GROUP_SIZE, GROUP_SIZE>::call(x, mask);
+}
+
+// Logical AND of x across a group of GROUP_SIZE lanes, computed with XOR shuffles.
+// `mask` is the lane mask passed to the *_sync shuffle intrinsics.
+template <unsigned int GROUP_SIZE>
+__device__ inline bool subgroup_and(bool x, uint32_t mask)
+{
+	return detail::subgroup_and_impl<GROUP_SIZE, GROUP_SIZE>::call(x, mask);
+}
+
+// Reads a T from the address `p`, reinterpreting the pointed-to storage.
+// NOTE(review): presumably `p` must be suitably aligned for T — confirm callers.
+template <typename T, typename S>
+__device__ inline T load_as(const S *p)
+{
+	const T *typed = reinterpret_cast<const T *>(p);
+	return *typed;
+}
+
+// Writes x as a T to the address `p`, reinterpreting the pointed-to storage.
+// NOTE(review): presumably `p` must be suitably aligned for T — confirm callers.
+template <typename T, typename S>
+__device__ inline void store_as(S *p, const T& x)
+{
+	T *typed = reinterpret_cast<T *>(p);
+	*typed = x;
+}
+
+// Narrows four values to uint8_t and packs them into one 32-bit word, with x
+// in the first byte of the word's storage and w in the last.
+template <typename T>
+__device__ inline uint32_t pack_uint8x4(T x, T y, T z, T w)
+{
+	uchar4 bytes;
+	bytes.x = static_cast<uint8_t>(x);
+	bytes.y = static_cast<uint8_t>(y);
+	bytes.z = static_cast<uint8_t>(z);
+	bytes.w = static_cast<uint8_t>(w);
+
+	const uint32_t word = load_as<uint32_t>(&bytes);
+	return word;
+}
+
+
+// load_uint8_vector<N>: reads N consecutive uint8 values starting at `ptr` and
+// widens each into dest[0..N-1]. Only N = 1, 2, 4, 8 and 16 are defined; the
+// wider variants fetch the bytes with a single vector-typed load.
+template <unsigned int N>
+__device__ inline void load_uint8_vector(uint32_t *dest, const uint8_t *ptr);
+
+template <>
+__device__ inline void load_uint8_vector<1u>(uint32_t *dest, const uint8_t *ptr)
+{
+	const uint8_t value = ptr[0];
+	dest[0] = static_cast<uint32_t>(value);
+}
+
+template <>
+__device__ inline void load_uint8_vector<2u>(uint32_t *dest, const uint8_t *ptr)
+{
+	const auto pair = load_as<uchar2>(ptr);
+	dest[0] = pair.x;
+	dest[1] = pair.y;
+}
+
+template <>
+__device__ inline void load_uint8_vector<4u>(uint32_t *dest, const uint8_t *ptr)
+{
+	const auto quad = load_as<uchar4>(ptr);
+	dest[0] = quad.x;
+	dest[1] = quad.y;
+	dest[2] = quad.z;
+	dest[3] = quad.w;
+}
+
+template <>
+__device__ inline void load_uint8_vector<8u>(uint32_t *dest, const uint8_t *ptr)
+{
+	// One 8-byte load, then unpack each 32-bit half as four bytes.
+	const auto words = load_as<uint2>(ptr);
+	const uint8_t *bytes_x = reinterpret_cast<const uint8_t *>(&words.x);
+	const uint8_t *bytes_y = reinterpret_cast<const uint8_t *>(&words.y);
+	load_uint8_vector<4u>(dest + 0, bytes_x);
+	load_uint8_vector<4u>(dest + 4, bytes_y);
+}
+
+template <>
+__device__ inline void load_uint8_vector<16u>(uint32_t *dest, const uint8_t *ptr)
+{
+	// One 16-byte load, then unpack each 32-bit component as four bytes.
+	const auto words = load_as<uint4>(ptr);
+	const uint8_t *bytes_x = reinterpret_cast<const uint8_t *>(&words.x);
+	const uint8_t *bytes_y = reinterpret_cast<const uint8_t *>(&words.y);
+	const uint8_t *bytes_z = reinterpret_cast<const uint8_t *>(&words.z);
+	const uint8_t *bytes_w = reinterpret_cast<const uint8_t *>(&words.w);
+	load_uint8_vector<4u>(dest +  0, bytes_x);
+	load_uint8_vector<4u>(dest +  4, bytes_y);
+	load_uint8_vector<4u>(dest +  8, bytes_z);
+	load_uint8_vector<4u>(dest + 12, bytes_w);
+}
+
+
+// store_uint8_vector<N>: narrows ptr[0..N-1] to uint8 and writes the N bytes
+// consecutively starting at `dest`. Only N = 1, 2, 4, 8 and 16 are defined; the
+// wider variants emit the bytes with a single vector-typed store.
+template <unsigned int N>
+__device__ inline void store_uint8_vector(uint8_t *dest, const uint32_t *ptr);
+
+template <>
+__device__ inline void store_uint8_vector<1u>(uint8_t *dest, const uint32_t *ptr)
+{
+	const uint8_t narrowed = static_cast<uint8_t>(ptr[0]);
+	dest[0] = narrowed;
+}
+
+template <>
+__device__ inline void store_uint8_vector<2u>(uint8_t *dest, const uint32_t *ptr)
+{
+	uchar2 pair;
+	pair.x = static_cast<uint8_t>(ptr[0]);
+	pair.y = static_cast<uint8_t>(ptr[1]);
+	store_as<uchar2>(dest, pair);
+}
+
+template <>
+__device__ inline void store_uint8_vector<4u>(uint8_t *dest, const uint32_t *ptr)
+{
+	const uint32_t word = pack_uint8x4(ptr[0], ptr[1], ptr[2], ptr[3]);
+	store_as<uint32_t>(dest, word);
+}
+
+template <>
+__device__ inline void store_uint8_vector<8u>(uint8_t *dest, const uint32_t *ptr)
+{
+	uint2 words;
+	words.x = pack_uint8x4(ptr[0], ptr[1], ptr[2], ptr[3]);
+	words.y = pack_uint8x4(ptr[4], ptr[5], ptr[6], ptr[7]);
+	store_as<uint2>(dest, words);
+}
+
+template <>
+__device__ inline void store_uint8_vector<16u>(uint8_t *dest, const uint32_t *ptr)
+{
+	uint4 words;
+	words.x = pack_uint8x4(ptr[ 0], ptr[ 1], ptr[ 2], ptr[ 3]);
+	words.y = pack_uint8x4(ptr[ 4], ptr[ 5], ptr[ 6], ptr[ 7]);
+	words.z = pack_uint8x4(ptr[ 8], ptr[ 9], ptr[10], ptr[11]);
+	words.w = pack_uint8x4(ptr[12], ptr[13], ptr[14], ptr[15]);
+	store_as<uint4>(dest, words);
+}
+
+
+// load_uint16_vector<N>: reads N consecutive uint16 values starting at `ptr`
+// and widens each into dest[0..N-1]. Only N = 1, 2, 4 and 8 are defined; the
+// wider variants fetch the values with a single vector-typed load.
+template <unsigned int N>
+__device__ inline void load_uint16_vector(uint32_t *dest, const uint16_t *ptr);
+
+template <>
+__device__ inline void load_uint16_vector<1u>(uint32_t *dest, const uint16_t *ptr)
+{
+	const uint16_t value = ptr[0];
+	dest[0] = static_cast<uint32_t>(value);
+}
+
+template <>
+__device__ inline void load_uint16_vector<2u>(uint32_t *dest, const uint16_t *ptr)
+{
+	const auto pair = load_as<ushort2>(ptr);
+	dest[0] = pair.x;
+	dest[1] = pair.y;
+}
+
+template <>
+__device__ inline void load_uint16_vector<4u>(uint32_t *dest, const uint16_t *ptr)
+{
+	const auto quad = load_as<ushort4>(ptr);
+	dest[0] = quad.x;
+	dest[1] = quad.y;
+	dest[2] = quad.z;
+	dest[3] = quad.w;
+}
+
+template <>
+__device__ inline void load_uint16_vector<8u>(uint32_t *dest, const uint16_t *ptr)
+{
+	// One 16-byte load, then unpack each 32-bit component as two uint16 values.
+	const auto words = load_as<uint4>(ptr);
+	const uint16_t *halves_x = reinterpret_cast<const uint16_t *>(&words.x);
+	const uint16_t *halves_y = reinterpret_cast<const uint16_t *>(&words.y);
+	const uint16_t *halves_z = reinterpret_cast<const uint16_t *>(&words.z);
+	const uint16_t *halves_w = reinterpret_cast<const uint16_t *>(&words.w);
+	load_uint16_vector<2u>(dest + 0, halves_x);
+	load_uint16_vector<2u>(dest + 2, halves_y);
+	load_uint16_vector<2u>(dest + 4, halves_z);
+	load_uint16_vector<2u>(dest + 6, halves_w);
+}
+
+
+// store_uint16_vector<N>: narrows ptr[0..N-1] to uint16 and writes the N values
+// consecutively starting at `dest`. Only N = 1, 2, 4, 8 and 16 are defined.
+template <unsigned int N>
+__device__ inline void store_uint16_vector(uint16_t *dest, const uint32_t *ptr);
+
+template <>
+__device__ inline void store_uint16_vector<1u>(uint16_t *dest, const uint32_t *ptr)
+{
+	const uint16_t narrowed = static_cast<uint16_t>(ptr[0]);
+	dest[0] = narrowed;
+}
+
+template <>
+__device__ inline void store_uint16_vector<2u>(uint16_t *dest, const uint32_t *ptr)
+{
+	ushort2 pair;
+	pair.x = static_cast<uint16_t>(ptr[0]);
+	pair.y = static_cast<uint16_t>(ptr[1]);
+	store_as<ushort2>(dest, pair);
+}
+
+template <>
+__device__ inline void store_uint16_vector<4u>(uint16_t *dest, const uint32_t *ptr)
+{
+	ushort4 quad;
+	quad.x = static_cast<uint16_t>(ptr[0]);
+	quad.y = static_cast<uint16_t>(ptr[1]);
+	quad.z = static_cast<uint16_t>(ptr[2]);
+	quad.w = static_cast<uint16_t>(ptr[3]);
+	store_as<ushort4>(dest, quad);
+}
+
+template <>
+__device__ inline void store_uint16_vector<8u>(uint16_t *dest, const uint32_t *ptr)
+{
+	// Assemble the eight values in a local uint4 (two per 32-bit component),
+	// then emit them with one 16-byte store.
+	uint4 words;
+	uint16_t *halves_x = reinterpret_cast<uint16_t *>(&words.x);
+	uint16_t *halves_y = reinterpret_cast<uint16_t *>(&words.y);
+	uint16_t *halves_z = reinterpret_cast<uint16_t *>(&words.z);
+	uint16_t *halves_w = reinterpret_cast<uint16_t *>(&words.w);
+	store_uint16_vector<2u>(halves_x, &ptr[0]);
+	store_uint16_vector<2u>(halves_y, &ptr[2]);
+	store_uint16_vector<2u>(halves_z, &ptr[4]);
+	store_uint16_vector<2u>(halves_w, &ptr[6]);
+	store_as<uint4>(dest, words);
+}
+
+template <>
+__device__ inline void store_uint16_vector<16u>(uint16_t *dest, const uint32_t *ptr)
+{
+	// Two 8-element halves.
+	store_uint16_vector<8u>(dest + 0, ptr + 0);
+	store_uint16_vector<8u>(dest + 8, ptr + 8);
+}
+
+} // namespace sgm
+
+#endif // !__DEVICE_UTILITY_H__
@@ -0,0 +1,45 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#ifndef __HOST_UTILITY_H__
+#define __HOST_UTILITY_H__
+
+#include <cstdio>
+#include <stdexcept>
+
+// Reports a failed CUDA runtime call on stdout (message, numeric code, file and
+// line). Execution continues afterwards; nothing is thrown.
+//
+// The argument is evaluated exactly once and its result is stored in a local.
+// This matters for calls with side effects: cudaGetLastError() resets the
+// error state, so evaluating it again for the message would report "no error",
+// and a failing allocation or copy would otherwise be re-executed.
+#define CUDA_CHECK(err) \
+do {\
+	const cudaError_t sgm_cuda_check_status_ = (err); \
+	if (sgm_cuda_check_status_ != cudaSuccess) { \
+		printf("[CUDA Error] %s (code: %d) at %s:%d\n", cudaGetErrorString(sgm_cuda_check_status_), static_cast<int>(sgm_cuda_check_status_), __FILE__, __LINE__); \
+	} \
+} while (0)
+
+#define SGM_ASSERT(expr, msg) \
+if (!(expr)) { \
+	throw std::logic_error(msg); \
+} \
+
+namespace sgm
+{
+
+// Integer division rounded up: number of chunks of size `grain` needed to
+// cover `total` items (for non-negative total and positive grain).
+static inline int divUp(int total, int grain)
+{
+	const int padded = total + grain - 1;
+	return padded / grain;
+}
+
+} // namespace sgm
+
+#endif // !__HOST_UTILITY_H__
@@ -0,0 +1,48 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#ifndef __INTERNAL_H__
+#define __INTERNAL_H__
+
+#include "libsgm.h"
+#include "device_image.h"
+
+namespace sgm
+{
+namespace details
+{
+
+// Internal pipeline stages operating on device images.
+// NOTE(review): only cost_aggregation and the two cast helpers were checked
+// against their definitions; the other stages are described by signature only.
+
+// Census transform of `src` into `dst`, variant selected by `type`.
+void census_transform(const DeviceImage& src, DeviceImage& dst, CensusType type);
+
+// Aggregates matching costs of the left/right census images along 4 or 8 scan
+// paths into `dst` (one row per path). Dispatches on census element type
+// (SGM_32U / SGM_64U) and disp_size (64, 128 or 256); other combinations leave
+// `dst` untouched. Throws std::logic_error if srcL and srcR differ in type.
+void cost_aggregation(const DeviceImage& srcL, const DeviceImage& srcR, DeviceImage& dst,
+	int disp_size, int P1, int P2, PathType path_type, int min_disp);
+
+// Selects disparities from the aggregated costs `src` into dstL / dstR.
+void winner_takes_all(const DeviceImage& src, DeviceImage& dstL, DeviceImage& dstR,
+	int disp_size, float uniqueness, bool subpixel, PathType path_type);
+
+// Median filter of `src` into `dst`.
+void median_filter(const DeviceImage& src, DeviceImage& dst);
+
+// Left/right consistency check; updates dispL in place.
+void check_consistency(DeviceImage& dispL, const DeviceImage& dispR, const DeviceImage& srcL, bool subpixel, int LR_max_diff);
+
+// Adjusts `disp` in place for the configured minimum disparity.
+void correct_disparity_range(DeviceImage& disp, bool subpixel, int min_disp);
+
+// Element-wise conversions; `dst` is (re)created with the geometry of `src`.
+// The 16->8 bit conversion truncates each value to its low 8 bits.
+void cast_16bit_to_8bit(const DeviceImage& src, DeviceImage& dst);
+void cast_8bit_to_16bit(const DeviceImage& src, DeviceImage& dst);
+
+} // namespace details
+} // namespace sgm
+
+#endif // !__INTERNAL_H__
@@ -0,0 +1,218 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include <libsgm.h>
+
+#include <iostream>
+
+#include "internal.h"
+#include "host_utility.h"
+
+namespace sgm
+{
+
+// Returns true when an output image with dst_depth bits per pixel can hold every
+// value the pipeline may emit for the given disparity range.
+//   - The largest value is disparity_size + min_disp - 1, scaled (plus the
+//     largest fractional part) when subpixel output is enabled.
+//   - When min_disp <= 0 the output must additionally fit a signed dst_depth-bit
+//     range that also contains (min_disp - 1), scaled when subpixel is enabled.
+static bool has_enough_depth(int dst_depth, int disparity_size, int min_disp, bool subpixel)
+{
+	const int64_t scale = subpixel ? sgm::StereoSGM::SUBPIXEL_SCALE : 1;
+
+	// largest value that may be written (scale == 1 leaves it unchanged)
+	const int64_t largest_disp = static_cast<int64_t>(disparity_size) + min_disp - 1;
+	const int64_t largest = largest_disp * scale + (scale - 1);
+
+	const int64_t unsigned_limit = 1ll << dst_depth;
+	if (largest >= unsigned_limit)
+		return false;
+
+	// strictly positive minimum disparity: the unsigned check above is sufficient
+	if (min_disp > 0)
+		return true;
+
+	// otherwise the output must be representable as a signed value
+	const int64_t smallest = (static_cast<int64_t>(min_disp) - 1) * scale;
+	const int64_t signed_limit = 1ll << (dst_depth - 1);
+	return smallest >= -signed_limit && largest < signed_limit;
+}
+
+// Private implementation behind StereoSGM: owns the device-side buffers and
+// runs the pipeline stages declared in internal.h in a fixed order.
+class StereoSGM::Impl
+{
+public:
+
+	// Validates the configuration and allocates the buffers whose geometry is
+	// already known. SGM_ASSERT throws std::logic_error when src_depth, dst_depth,
+	// disparity_size or the output depth / disparity range combination is rejected.
+	// Bit 0 of inout_type marks the inputs as device pointers, bit 1 marks the
+	// output as a device pointer.
+	Impl(int width, int height, int disparity_size, int src_depth, int dst_depth, int src_pitch, int dst_pitch,
+		ExecuteInOut inout_type, const Parameters& param) :
+		width_(width),
+		height_(height),
+		disp_size_(disparity_size),
+		src_pitch_(src_pitch),
+		dst_pitch_(dst_pitch),
+		param_(param)
+	{
+		// check values
+		SGM_ASSERT(src_depth == 8 || src_depth == 16 || src_depth == 32, "src depth bits must be 8, 16 or 32");
+		SGM_ASSERT(dst_depth == 8 || dst_depth == 16, "dst depth bits must be 8 or 16");
+		SGM_ASSERT(disparity_size == 64 || disparity_size == 128 || disparity_size == 256, "disparity size must be 64 or 128 or 256");
+		SGM_ASSERT(has_enough_depth(dst_depth, disparity_size, param_.min_disp, param_.subpixel),
+			"output depth bits must be sufficient for representing output value");
+
+		src_type_ = src_depth == 8 ? SGM_8U : src_depth == 16 ? SGM_16U : SGM_32U;
+		dst_type_ = dst_depth == 8 ? SGM_8U : SGM_16U;
+
+		is_src_devptr_ = (inout_type & 0x01) > 0;
+		is_dst_devptr_ = (inout_type & 0x02) > 0;
+
+		// host inputs need device-side staging buffers; device inputs are bound
+		// to the caller's pointers on every execute() call instead
+		if (!is_src_devptr_) {
+			d_srcL_.create(height, width, src_type_, src_pitch);
+			d_srcR_.create(height, width, src_type_, src_pitch);
+		}
+
+		// the 9x7 census variant uses 64-bit descriptors, any other variant 32-bit
+		const ImageType census_type = param.census_type == CensusType::CENSUS_9x7 ? SGM_64U : SGM_32U;
+		d_censusL_.create(height, width, census_type);
+		d_censusR_.create(height, width, census_type);
+		d_censusL_.fill_zero();
+		d_censusR_.fill_zero();
+
+		d_tmpL_.create(height, width, SGM_16U, dst_pitch);
+		d_tmpR_.create(height, width, SGM_16U, dst_pitch);
+
+		// with a 16-bit device output, d_dispL_ is bound to the caller's buffer in
+		// execute(), so no allocation is needed here
+		if (!(is_dst_devptr_ && dst_type_ == SGM_16U)) {
+			d_dispL_.create(height, width, SGM_16U, dst_pitch);
+		}
+		d_dispR_.create(height, width, SGM_16U, dst_pitch);
+	}
+
+	// Runs the full pipeline on one stereo pair.
+	// srcL/srcR and dst are host or device pointers according to the inout_type
+	// given at construction; the result is written to dst.
+	void execute(const void* srcL, const void* srcR, void* dst)
+	{
+		if (is_src_devptr_) {
+			// wrap the caller's device memory
+			d_srcL_.create((void*)srcL, height_, width_, src_type_, src_pitch_);
+			d_srcR_.create((void*)srcR, height_, width_, src_type_, src_pitch_);
+		}
+		else {
+			d_srcL_.upload(srcL);
+			d_srcR_.upload(srcR);
+		}
+		if (is_dst_devptr_ && dst_type_ == SGM_16U) {
+			// when there is no device-host copy or type conversion, use passed buffer
+			d_dispL_.create((void*)dst, height_, width_, SGM_16U, dst_pitch_);
+		}
+
+		// census transform
+		details::census_transform(d_srcL_, d_censusL_, param_.census_type);
+		details::census_transform(d_srcR_, d_censusR_, param_.census_type);
+
+		// cost aggregation
+		// NOTE(review): d_cost_ is never created in this class; presumably
+		// cost_aggregation allocates it — confirm in its implementation.
+		details::cost_aggregation(d_censusL_, d_censusR_, d_cost_, disp_size_,
+			param_.P1, param_.P2, param_.path_type, param_.min_disp);
+
+		// winner-takes-all
+		details::winner_takes_all(d_cost_, d_tmpL_, d_tmpR_, disp_size_,
+			param_.uniqueness, param_.subpixel, param_.path_type);
+
+		// post filtering
+		details::median_filter(d_tmpL_, d_dispL_);
+		details::median_filter(d_tmpR_, d_dispR_);
+
+		// consistency check
+		details::check_consistency(d_dispL_, d_dispR_, d_srcL_, param_.subpixel, param_.LR_max_diff);
+		details::correct_disparity_range(d_dispL_, param_.subpixel, param_.min_disp);
+
+		// deliver the result in the requested location and depth
+		if (!is_dst_devptr_ && dst_type_ == SGM_8U) {
+			// d_tmpL_ is no longer needed at this point and is reused as staging buffer
+			details::cast_16bit_to_8bit(d_dispL_, d_tmpL_);
+			d_tmpL_.download(dst);
+		}
+		else if (is_dst_devptr_ && dst_type_ == SGM_8U) {
+			DeviceImage d_dst(dst, height_, width_, SGM_8U, dst_pitch_);
+			details::cast_16bit_to_8bit(d_dispL_, d_dst);
+		}
+		else if (!is_dst_devptr_ && dst_type_ == SGM_16U) {
+			d_dispL_.download(dst);
+		}
+		else if (is_dst_devptr_ && dst_type_ == SGM_16U) {
+			// nothing to do: d_dispL_ already aliases dst (bound above), so the
+			// result was written in place
+		}
+		else {
+			// not reachable: dst_type_ is only ever SGM_8U or SGM_16U and all four
+			// combinations are handled above
+			std::cerr << "not impl" << std::endl;
+		}
+	}
+
+	// Value obtained from (min_disp - 1), multiplied by SUBPIXEL_SCALE when
+	// subpixel output is enabled.
+	int get_invalid_disparity() const
+	{
+		return (param_.min_disp - 1) * (param_.subpixel ? SUBPIXEL_SCALE : 1);
+	}
+
+private:
+
+	// image geometry and configuration captured at construction
+	int width_;
+	int height_;
+	int disp_size_;
+	int src_pitch_;
+	int dst_pitch_;
+	Parameters param_;
+
+	ImageType src_type_;
+	ImageType dst_type_;
+	bool is_src_devptr_;
+	bool is_dst_devptr_;
+
+	// device buffers, listed in pipeline order
+	DeviceImage d_srcL_;
+	DeviceImage d_srcR_;
+	DeviceImage d_censusL_;
+	DeviceImage d_censusR_;
+	DeviceImage d_cost_;
+	DeviceImage d_tmpL_;
+	DeviceImage d_tmpR_;
+	DeviceImage d_dispL_;
+	DeviceImage d_dispR_;
+};
+
+// Stores every argument unchanged in the member of the same name.
+StereoSGM::Parameters::Parameters(int P1, int P2, float uniqueness, bool subpixel, PathType path_type,
+	int min_disp, int LR_max_diff, CensusType census_type)
+	: P1(P1)
+	, P2(P2)
+	, uniqueness(uniqueness)
+	, subpixel(subpixel)
+	, path_type(path_type)
+	, min_disp(min_disp)
+	, LR_max_diff(LR_max_diff)
+	, census_type(census_type)
+{
+}
+
+// Convenience constructor: `width` is used as both the source and the
+// destination pitch.
+StereoSGM::StereoSGM(int width, int height, int disparity_size, int src_depth, int dst_depth,
+	ExecuteInOut inout_type, const Parameters& param)
+	: impl_(new Impl(width, height, disparity_size, src_depth, dst_depth, width, width, inout_type, param))
+{
+}
+
+// Constructor with explicit source and destination pitches.
+StereoSGM::StereoSGM(int width, int height, int disparity_size, int src_depth, int dst_depth, int src_pitch, int dst_pitch,
+	ExecuteInOut inout_type, const Parameters& param)
+	: impl_(new Impl(width, height, disparity_size, src_depth, dst_depth, src_pitch, dst_pitch, inout_type, param))
+{
+}
+
+// Releases the implementation object created by the constructor.
+StereoSGM::~StereoSGM()
+{
+	delete impl_;
+}
+
+// Forwards to Impl::execute.
+void StereoSGM::execute(const void* srcL, const void* srcR, void* dst)
+{
+	impl_->execute(srcL, srcR, dst);
+}
+
+// Forwards to Impl::get_invalid_disparity.
+int StereoSGM::get_invalid_disparity() const
+{
+	const int invalid = impl_->get_invalid_disparity();
+	return invalid;
+}
+
+} // namespace sgm
@@ -0,0 +1,145 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include <libsgm_wrapper.h>
+
+namespace sgm
+{
+
+// Records the matching parameters. The StereoSGM instance itself is created
+// later, on the first execute() call.
+LibSGMWrapper::LibSGMWrapper(int numDisparity, int P1, int P2, float uniquenessRatio, bool subpixel, PathType pathType, int minDisparity, int lrMaxDiff, CensusType censusType)
+	: sgm_(nullptr)
+	, numDisparity_(numDisparity)
+	, param_(P1, P2, uniquenessRatio, subpixel, pathType, minDisparity, lrMaxDiff, censusType)
+	, prev_(nullptr)
+{
+}
+
+LibSGMWrapper::~LibSGMWrapper() = default;
+
+// Read-only accessors for the values given at construction.
+
+int LibSGMWrapper::getNumDisparities() const
+{
+	return numDisparity_;
+}
+
+float LibSGMWrapper::getUniquenessRatio() const
+{
+	return param_.uniqueness;
+}
+
+int LibSGMWrapper::getP1() const
+{
+	return param_.P1;
+}
+
+int LibSGMWrapper::getP2() const
+{
+	return param_.P2;
+}
+
+bool LibSGMWrapper::hasSubpixel() const
+{
+	return param_.subpixel;
+}
+
+PathType LibSGMWrapper::getPathType() const
+{
+	return param_.path_type;
+}
+
+int LibSGMWrapper::getMinDisparity() const
+{
+	return param_.min_disp;
+}
+
+int LibSGMWrapper::getLrMaxDiff() const
+{
+	return param_.LR_max_diff;
+}
+
+CensusType LibSGMWrapper::getCensusType() const
+{
+	return param_.census_type;
+}
+
+// (min_disp - 1), multiplied by StereoSGM::SUBPIXEL_SCALE when subpixel output
+// is enabled.
+int LibSGMWrapper::getInvalidDisparity() const
+{
+	const int scale = param_.subpixel ? StereoSGM::SUBPIXEL_SCALE : 1;
+	return (param_.min_disp - 1) * scale;
+}
+
+// Snapshot of everything a StereoSGM instance depends on besides the matching
+// parameters: image geometry, pitches, bit depths and the memory location of
+// the buffers. Two snapshots compare equal when an existing StereoSGM can be
+// reused for the new call.
+struct LibSGMWrapper::Creator
+{
+	int width;
+	int height;
+	int src_pitch;
+	int dst_pitch;
+	int input_depth_bits;
+	int output_depth_bits;
+	sgm::ExecuteInOut inout_type;
+
+	// Field-by-field comparison.
+	bool operator==(const Creator& rhs) const
+	{
+		if (width != rhs.width || height != rhs.height)
+			return false;
+		if (src_pitch != rhs.src_pitch || dst_pitch != rhs.dst_pitch)
+			return false;
+		if (input_depth_bits != rhs.input_depth_bits || output_depth_bits != rhs.output_depth_bits)
+			return false;
+		return inout_type == rhs.inout_type;
+	}
+
+	bool operator!=(const Creator& rhs) const
+	{
+		return !operator==(rhs);
+	}
+
+	// Allocates a StereoSGM configured from this snapshot; the caller takes ownership.
+	StereoSGM* createStereoSGM(int disparity_size, const StereoSGM::Parameters& param)
+	{
+		StereoSGM* const instance = new StereoSGM(
+			width, height, disparity_size, input_depth_bits, output_depth_bits,
+			src_pitch, dst_pitch, inout_type, param);
+		return instance;
+	}
+
+#ifdef BUILD_OPENCV_WRAPPER
+
+	// Snapshot for device images: pitches are taken from step1(), bit depths from
+	// elemSize1(), and the buffers are treated as CUDA device memory.
+	Creator(const cv::cuda::GpuMat& src, const cv::cuda::GpuMat& dst)
+		: width(src.cols)
+		, height(src.rows)
+		, src_pitch(static_cast<int>(src.step1()))
+		, dst_pitch(static_cast<int>(dst.step1()))
+		, input_depth_bits(static_cast<int>(src.elemSize1()) * 8)
+		, output_depth_bits(static_cast<int>(dst.elemSize1()) * 8)
+		, inout_type(sgm::EXECUTE_INOUT_CUDA2CUDA)
+	{
+		const int depth = src.depth();
+		CV_Assert(depth == CV_8U || depth == CV_16U || depth == CV_32S);
+	}
+
+	// Snapshot for host images: same as above, but the buffers are host memory.
+	Creator(const cv::Mat& src, const cv::Mat& dst)
+		: width(src.cols)
+		, height(src.rows)
+		, src_pitch(static_cast<int>(src.step1()))
+		, dst_pitch(static_cast<int>(dst.step1()))
+		, input_depth_bits(static_cast<int>(src.elemSize1()) * 8)
+		, output_depth_bits(static_cast<int>(dst.elemSize1()) * 8)
+		, inout_type(sgm::EXECUTE_INOUT_HOST2HOST)
+	{
+		const int depth = src.depth();
+		CV_Assert(depth == CV_8U || depth == CV_16U || depth == CV_32S);
+	}
+
+#endif // BUILD_OPENCV_WRAPPER
+};
+
+#ifdef BUILD_OPENCV_WRAPPER
+
+// Computes the disparity map of the device images I1 (left) and I2 (right).
+//   I1, I2    : same size and type; depth must be CV_8U, CV_16U or CV_32S
+//               (checked with CV_Assert).
+//   disparity : output; re-created as a single-channel CV_16S image of the input
+//               size unless it already is exactly that.
+// The underlying StereoSGM is rebuilt only when the image geometry, pitches or
+// bit depths differ from the previous call.
+void LibSGMWrapper::execute(const cv::cuda::GpuMat& I1, const cv::cuda::GpuMat& I2, cv::cuda::GpuMat& disparity)
+{
+	const cv::Size size = I1.size();
+	CV_Assert(size == I2.size());
+	CV_Assert(I1.type() == I2.type());
+	const int depth = I1.depth();
+	CV_Assert(depth == CV_8U || depth == CV_16U || depth == CV_32S);
+	// Compare the full type rather than only the depth: a multi-channel 16S
+	// buffer has the right depth but a different row layout, and would otherwise
+	// be handed to the library as if it were single-channel.
+	if (disparity.size() != size || disparity.type() != CV_16SC1) {
+		disparity.create(size, CV_16SC1);
+	}
+	std::unique_ptr<Creator> creator(new Creator(I1, disparity));
+	if (!sgm_ || !prev_ || *creator != *prev_) {
+		sgm_.reset(creator->createStereoSGM(numDisparity_, param_));
+	}
+	prev_ = std::move(creator);
+
+	sgm_->execute(I1.data, I2.data, disparity.data);
+}
+
+// Computes the disparity map of the host images I1 (left) and I2 (right).
+//   I1, I2    : same size and type; depth must be CV_8U, CV_16U or CV_32S
+//               (checked with CV_Assert).
+//   disparity : output; re-created as a single-channel CV_16S image of the input
+//               size unless it already is exactly that.
+// The underlying StereoSGM is rebuilt only when the image geometry, pitches or
+// bit depths differ from the previous call.
+void LibSGMWrapper::execute(const cv::Mat& I1, const cv::Mat& I2, cv::Mat& disparity)
+{
+	const cv::Size size = I1.size();
+	CV_Assert(size == I2.size());
+	CV_Assert(I1.type() == I2.type());
+	const int depth = I1.depth();
+	CV_Assert(depth == CV_8U || depth == CV_16U || depth == CV_32S);
+	// Compare the full type rather than only the depth: a multi-channel 16S
+	// buffer has the right depth but a different row layout, and would otherwise
+	// be handed to the library as if it were single-channel.
+	if (disparity.size() != size || disparity.type() != CV_16SC1) {
+		disparity.create(size, CV_16SC1);
+	}
+	std::unique_ptr<Creator> creator(new Creator(I1, disparity));
+	if (!sgm_ || !prev_ || *creator != *prev_) {
+		sgm_.reset(creator->createStereoSGM(numDisparity_, param_));
+	}
+	prev_ = std::move(creator);
+
+	sgm_->execute(I1.data, I2.data, disparity.data);
+}
+
+#endif // BUILD_OPENCV_WRAPPER
+
+} // namespace sgm
@@ -0,0 +1,295 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include "internal.h"
+
+#include <cuda_runtime.h>
+
+#include "host_utility.h"
+
+namespace
+{
+
+// Thread-block dimensions used by the host launcher median_filter().
+const int BLOCK_X = 16;
+const int BLOCK_Y = 16;
+// The median window is KSIZE x KSIZE pixels.
+const int KSIZE = 3;
+// Distance from the window centre to its edge; pixels closer than this to the
+// image border are not filtered by the kernels below.
+const int RADIUS = KSIZE / 2;
+// Number of samples in one window.
+const int KSIZE_SQ = KSIZE * KSIZE;
+
+// Exchanges the values of x and y through a temporary copy.
+template <typename T>
+__device__ inline void swap(T& x, T& y)
+{
+	const T old_x = x;
+	x = y;
+	y = old_x;
+}
+
+// sort, min, max of 1 element
+// dev_sort orders the pair so that x <= y; dev_min stores the smaller value in
+// x and leaves y untouched; dev_max stores the larger value in y and leaves x
+// untouched.
+template <typename T, int V = 1>
+__device__ inline void dev_sort(T& x, T& y)
+{
+	if (!(x > y))
+		return;
+	swap(x, y);
+}
+
+template <typename T, int V = 1>
+__device__ inline void dev_min(T& x, T& y)
+{
+	x = min(x, y);
+}
+
+template <typename T, int V = 1>
+__device__ inline void dev_max(T& x, T& y)
+{
+	y = max(x, y);
+}
+
+// sort, min, max of 2 elements (two unsigned 16-bit lanes packed in a uint32_t)
+__device__ inline void dev_sort_2(uint32_t& x, uint32_t& y)
+{
+	// __vcmpgtu2 sets all bits of every 16-bit lane where x > y. XOR-ing both
+	// operands with (x ^ y) restricted to those lanes exchanges exactly them.
+	const uint32_t differing = x ^ y;
+	const uint32_t swap_bits = differing & __vcmpgtu2(x, y);
+	x ^= swap_bits;
+	y ^= swap_bits;
+}
+
+__device__ inline void dev_min_2(uint32_t& x, uint32_t& y)
+{
+	x = __vminu2(x, y);
+}
+
+__device__ inline void dev_max_2(uint32_t& x, uint32_t& y)
+{
+	y = __vmaxu2(x, y);
+}
+
+// V == 2 specializations route to the packed 16-bit implementations above.
+template <>
+__device__ inline void dev_sort<uint32_t, 2>(uint32_t& x, uint32_t& y)
+{
+	dev_sort_2(x, y);
+}
+
+template <>
+__device__ inline void dev_min<uint32_t, 2>(uint32_t& x, uint32_t& y)
+{
+	dev_min_2(x, y);
+}
+
+template <>
+__device__ inline void dev_max<uint32_t, 2>(uint32_t& x, uint32_t& y)
+{
+	dev_max_2(x, y);
+}
+
+// sort, min, max of 4 elements (four unsigned 8-bit lanes packed in a uint32_t)
+__device__ inline void dev_sort_4(uint32_t& x, uint32_t& y)
+{
+	// __vcmpgtu4 sets all bits of every 8-bit lane where x > y. XOR-ing both
+	// operands with (x ^ y) restricted to those lanes exchanges exactly them.
+	const uint32_t differing = x ^ y;
+	const uint32_t swap_bits = differing & __vcmpgtu4(x, y);
+	x ^= swap_bits;
+	y ^= swap_bits;
+}
+
+__device__ inline void dev_min_4(uint32_t& x, uint32_t& y)
+{
+	x = __vminu4(x, y);
+}
+
+__device__ inline void dev_max_4(uint32_t& x, uint32_t& y)
+{
+	y = __vmaxu4(x, y);
+}
+
+// V == 4 specializations route to the packed 8-bit implementations above.
+template <>
+__device__ inline void dev_sort<uint32_t, 4>(uint32_t& x, uint32_t& y)
+{
+	dev_sort_4(x, y);
+}
+
+template <>
+__device__ inline void dev_min<uint32_t, 4>(uint32_t& x, uint32_t& y)
+{
+	dev_min_4(x, y);
+}
+
+template <>
+__device__ inline void dev_max<uint32_t, 4>(uint32_t& x, uint32_t& y)
+{
+	dev_max_4(x, y);
+}
+
+// Fixed compare-exchange network over the 9 entries of buf, operating in place.
+// median() reads buf[4] (KSIZE_SQ / 2) after this has run, i.e. the network is
+// meant to leave the median there; the other entries are left partially ordered.
+// V selects the element-wise implementation: 1 compares whole values, 2 and 4
+// compare the packed 16-bit / 8-bit lanes of a uint32_t independently.
+// The sequence of operations must not be reordered.
+template <typename T, int V = 1>
+__device__ inline void median_selection_network_9(T* buf)
+{
+// SWAP_OP orders the pair (smaller in buf[i], larger in buf[j]);
+// MIN_OP writes the smaller value to buf[i] only;
+// MAX_OP writes the larger value to buf[j] only.
+#define SWAP_OP(i, j) dev_sort<T, V>(buf[i], buf[j])
+#define MIN_OP(i, j) dev_min<T, V>(buf[i], buf[j])
+#define MAX_OP(i, j) dev_max<T, V>(buf[i], buf[j])
+
+	// sort each group of three: (0,1,2), (3,4,5), (6,7,8)
+	SWAP_OP(0, 1); SWAP_OP(3, 4); SWAP_OP(6, 7);
+	SWAP_OP(1, 2); SWAP_OP(4, 5); SWAP_OP(7, 8);
+	SWAP_OP(0, 1); SWAP_OP(3, 4); SWAP_OP(6, 7);
+	// buf[6] <- largest of the three group minima
+	MAX_OP(0, 3); MAX_OP(3, 6);
+	// buf[4] <- median of the three group medians
+	SWAP_OP(1, 4); MIN_OP(4, 7); MAX_OP(1, 4);
+	// buf[2] <- smallest of the three group maxima
+	MIN_OP(5, 8); MIN_OP(2, 5);
+	// buf[4] <- median of buf[2], buf[4], buf[6]
+	SWAP_OP(2, 4); MIN_OP(4, 6); MAX_OP(2, 4);
+
+#undef SWAP_OP
+#undef MIN_OP
+#undef MAX_OP
+}
+
+// Runs the selection network on the KSIZE_SQ entries of buf (which is modified)
+// and returns the entry at the middle position.
+template <typename T, int V = 1>
+__device__ inline T median(T* buf)
+{
+	median_selection_network_9<T, V>(buf);
+	const T middle = buf[KSIZE_SQ / 2];
+	return middle;
+}
+
+// 3x3 median filter for 8-bit images, one thread per pixel.
+//   src, dst : input / output images with p elements per row
+//   w, h     : image size in pixels
+// Pixels closer than RADIUS to the image border are written as 0.
+__global__ void median_kernel_3x3_8u(const uint8_t* src, uint8_t* dst, int w, int h, int p)
+{
+	const int col = blockIdx.x * blockDim.x + threadIdx.x;
+	const int row = blockIdx.y * blockDim.y + threadIdx.y;
+
+	// threads outside the image have nothing to do
+	if (col >= w || row >= h)
+		return;
+
+	const bool on_border =
+		col < RADIUS || col >= w - RADIUS ||
+		row < RADIUS || row >= h - RADIUS;
+	if (on_border) {
+		dst[row * p + col] = 0;
+		return;
+	}
+
+	// gather the window row by row, then select its median
+	uint8_t window[KSIZE_SQ];
+	for (int dy = 0; dy < KSIZE; dy++) {
+		for (int dx = 0; dx < KSIZE; dx++)
+			window[dy * KSIZE + dx] = src[(row - RADIUS + dy) * p + (col - RADIUS + dx)];
+	}
+	dst[row * p + col] = median(window);
+}
+
+// 3x3 median filter for 16-bit images, one thread per pixel.
+//   src, dst : input / output images with p elements per row
+//   w, h     : image size in pixels
+// Pixels closer than RADIUS to the image border are written as 0.
+__global__ void median_kernel_3x3_16u(const uint16_t* src, uint16_t* dst, int w, int h, int p)
+{
+	const int col = blockIdx.x * blockDim.x + threadIdx.x;
+	const int row = blockIdx.y * blockDim.y + threadIdx.y;
+
+	// threads outside the image have nothing to do
+	if (col >= w || row >= h)
+		return;
+
+	const bool on_border =
+		col < RADIUS || col >= w - RADIUS ||
+		row < RADIUS || row >= h - RADIUS;
+	if (on_border) {
+		dst[row * p + col] = 0;
+		return;
+	}
+
+	// gather the window row by row, then select its median
+	uint16_t window[KSIZE_SQ];
+	for (int dy = 0; dy < KSIZE; dy++) {
+		for (int dx = 0; dx < KSIZE; dx++)
+			window[dy * KSIZE + dx] = src[(row - RADIUS + dy) * p + (col - RADIUS + dx)];
+	}
+	dst[row * p + col] = median(window);
+}
+
+// 3x3 median filter for 8-bit images where each thread produces four
+// horizontally adjacent pixels x_4 .. x_4 + 3 of row y.
+//   src, dst : input / output images with `pitch` elements per row
+//   w, h     : image size in pixels
+// Pixels closer than RADIUS to the image border are written as 0.
+// The fast path reads and writes rows as 32-bit words, so every accessed
+// address must be 4-byte aligned; the launcher only selects this kernel when
+// pitch % 4 == 0.
+// NOTE(review): this also requires 4-byte aligned base pointers — confirm for
+// DeviceImage buffers that wrap caller-provided memory.
+__global__ void median_kernel_3x3_8u_v4(const uint8_t* src, uint8_t* dst, int w, int h, int pitch)
+{
+	const int x_4 = 4 * (blockIdx.x * blockDim.x + threadIdx.x);
+	const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+	if (y >= h)
+		return;
+
+	// top and bottom border rows: clear this thread's pixels
+	if (y < RADIUS || y >= h - RADIUS) {
+		for (int x = x_4; x < min(x_4 + 4, w); x++)
+			dst[y * pitch + x] = 0;
+		return;
+	}
+
+	uint32_t buf[KSIZE_SQ];
+	// fast path: the words to the left (x_4 - 4) and to the right (x_4 + 4 .. x_4 + 7)
+	// are inside the row
+	if (x_4 >= 4 && x_4 + 7 < w)
+	{
+		// load the left, centre and right word of the three window rows
+		buf[0] = *((const uint32_t*)&src[(y - 1) * pitch + x_4 - 4]);
+		buf[1] = *((const uint32_t*)&src[(y - 1) * pitch + x_4 - 0]);
+		buf[2] = *((const uint32_t*)&src[(y - 1) * pitch + x_4 + 4]);
+
+		buf[3] = *((const uint32_t*)&src[(y - 0) * pitch + x_4 - 4]);
+		buf[4] = *((const uint32_t*)&src[(y - 0) * pitch + x_4 - 0]);
+		buf[5] = *((const uint32_t*)&src[(y - 0) * pitch + x_4 + 4]);
+
+		buf[6] = *((const uint32_t*)&src[(y + 1) * pitch + x_4 - 4]);
+		buf[7] = *((const uint32_t*)&src[(y + 1) * pitch + x_4 - 0]);
+		buf[8] = *((const uint32_t*)&src[(y + 1) * pitch + x_4 + 4]);
+
+		// Turn the left/right words into the centre word shifted by one pixel
+		// (assuming the lowest byte of a word is the lowest pixel address):
+		// left  = pixels x_4 - 1 .. x_4 + 2, right = pixels x_4 + 1 .. x_4 + 4.
+		// Byte lane i of the nine words then holds the 3x3 window of pixel x_4 + i.
+		buf[0] = (buf[1] << 8) | (buf[0] >> 24);
+		buf[2] = (buf[1] >> 8) | (buf[2] << 24);
+
+		buf[3] = (buf[4] << 8) | (buf[3] >> 24);
+		buf[5] = (buf[4] >> 8) | (buf[5] << 24);
+
+		buf[6] = (buf[7] << 8) | (buf[6] >> 24);
+		buf[8] = (buf[7] >> 8) | (buf[8] << 24);
+
+		// four independent medians, one per byte lane, stored as a single word
+		*((uint32_t*)&dst[y * pitch + x_4]) = median<uint32_t, 4>(buf);
+	}
+	else if (x_4 < w) {
+
+		// slow path near the left/right image edge: one pixel at a time
+		for (int x = x_4; x < min(x_4 + 4, w); x++) {
+			if (x >= RADIUS && x < w - RADIUS) {
+				// reuse the word buffer's storage as a byte window
+				uint8_t* buf_u8 = (uint8_t*)buf;
+				for (int i = 0; i < KSIZE_SQ; i++)
+					buf_u8[i] = src[(y - RADIUS + i / KSIZE) * pitch + (x - RADIUS + i % KSIZE)];
+				dst[y * pitch + x] = median(buf_u8);
+			}
+			else {
+				dst[y * pitch + x] = 0;
+			}
+		}
+	}
+}
+
+// 3x3 median filter for 16-bit images where each thread produces two
+// horizontally adjacent pixels x_2 and x_2 + 1 of row y.
+//   src, dst : input / output images with `pitch` elements per row
+//   w, h     : image size in pixels
+// Pixels closer than RADIUS to the image border are written as 0.
+// The fast path reads and writes rows as 32-bit words, so every accessed
+// address must be 4-byte aligned; the launcher only selects this kernel when
+// pitch % 2 == 0.
+// NOTE(review): this also requires 4-byte aligned base pointers — confirm for
+// DeviceImage buffers that wrap caller-provided memory.
+__global__ void median_kernel_3x3_16u_v2(const uint16_t* src, uint16_t* dst, int w, int h, int pitch)
+{
+	const int x_2 = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
+	const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+	if (y >= h)
+		return;
+
+	// top and bottom border rows: clear this thread's pixels
+	if (y < RADIUS || y >= h - RADIUS) {
+		for (int x = x_2; x < min(x_2 + 2, w); x++)
+			dst[y * pitch + x] = 0;
+		return;
+	}
+
+	uint32_t buf[KSIZE_SQ];
+	// fast path: the words to the left (x_2 - 2) and to the right (x_2 + 2, x_2 + 3)
+	// are inside the row
+	if (x_2 >= 2 && x_2 + 3 < w)
+	{
+		// load the left, centre and right word of the three window rows
+		buf[0] = *((const uint32_t*)&src[(y - 1) * pitch + x_2 - 2]);
+		buf[1] = *((const uint32_t*)&src[(y - 1) * pitch + x_2 - 0]);
+		buf[2] = *((const uint32_t*)&src[(y - 1) * pitch + x_2 + 2]);
+
+		buf[3] = *((const uint32_t*)&src[(y - 0) * pitch + x_2 - 2]);
+		buf[4] = *((const uint32_t*)&src[(y - 0) * pitch + x_2 - 0]);
+		buf[5] = *((const uint32_t*)&src[(y - 0) * pitch + x_2 + 2]);
+
+		buf[6] = *((const uint32_t*)&src[(y + 1) * pitch + x_2 - 2]);
+		buf[7] = *((const uint32_t*)&src[(y + 1) * pitch + x_2 - 0]);
+		buf[8] = *((const uint32_t*)&src[(y + 1) * pitch + x_2 + 2]);
+
+		// Turn the left/right words into the centre word shifted by one pixel
+		// (assuming the low half of a word is the lower pixel address):
+		// left  = pixels x_2 - 1, x_2 and right = pixels x_2 + 1, x_2 + 2.
+		// Halfword lane i of the nine words then holds the 3x3 window of pixel x_2 + i.
+		buf[0] = (buf[1] << 16) | (buf[0] >> 16);
+		buf[2] = (buf[1] >> 16) | (buf[2] << 16);
+
+		buf[3] = (buf[4] << 16) | (buf[3] >> 16);
+		buf[5] = (buf[4] >> 16) | (buf[5] << 16);
+
+		buf[6] = (buf[7] << 16) | (buf[6] >> 16);
+		buf[8] = (buf[7] >> 16) | (buf[8] << 16);
+
+		// two independent medians, one per halfword lane, stored as a single word
+		*((uint32_t*)&dst[y * pitch + x_2]) = median<uint32_t, 2>(buf);
+	}
+	else if (x_2 < w) {
+
+		// slow path near the left/right image edge: one pixel at a time
+		for (int x = x_2; x < min(x_2 + 2, w); x++) {
+			if (x >= RADIUS && x < w - RADIUS) {
+				// reuse the word buffer's storage as a halfword window
+				uint16_t* buf_u16 = (uint16_t*)buf;
+				for (int i = 0; i < KSIZE_SQ; i++)
+					buf_u16[i] = src[(y - RADIUS + i / KSIZE) * pitch + (x - RADIUS + i % KSIZE)];
+				dst[y * pitch + x] = median(buf_u16);
+			}
+			else {
+				dst[y * pitch + x] = 0;
+			}
+		}
+	}
+}
+
+} // namespace
+
+namespace sgm
+{
+namespace details
+{
+
+// Applies a 3x3 median filter to src and stores the result in dst.
+// dst is (re)created with the rows, cols, type and step of src. A kernel is
+// launched only for SGM_8U and SGM_16U images; the variants that process
+// several pixels per thread are used when the step allows 32-bit row accesses
+// (step % 4 == 0 for 8-bit, step % 2 == 0 for 16-bit data). Launch errors are
+// reported through CUDA_CHECK.
+void median_filter(const DeviceImage& src, DeviceImage& dst)
+{
+	const int cols = src.cols;
+	const int rows = src.rows;
+	const int step = src.step;
+
+	dst.create(rows, cols, src.type, src.step);
+
+	const dim3 threads(BLOCK_X, BLOCK_Y);
+
+	if (src.type == SGM_8U && step % 4 == 0) {
+		// four 8-bit pixels per thread
+		const dim3 blocks(divUp(divUp(cols, 4), threads.x), divUp(rows, threads.y));
+		median_kernel_3x3_8u_v4<<<blocks, threads>>>(src.ptr<uint8_t>(), dst.ptr<uint8_t>(), cols, rows, step);
+	}
+	else if (src.type == SGM_8U) {
+		const dim3 blocks(divUp(cols, threads.x), divUp(rows, threads.y));
+		median_kernel_3x3_8u<<<blocks, threads>>>(src.ptr<uint8_t>(), dst.ptr<uint8_t>(), cols, rows, step);
+	}
+	else if (src.type == SGM_16U && step % 2 == 0) {
+		// two 16-bit pixels per thread
+		const dim3 blocks(divUp(divUp(cols, 2), threads.x), divUp(rows, threads.y));
+		median_kernel_3x3_16u_v2<<<blocks, threads>>>(src.ptr<uint16_t>(), dst.ptr<uint16_t>(), cols, rows, step);
+	}
+	else if (src.type == SGM_16U) {
+		const dim3 blocks(divUp(cols, threads.x), divUp(rows, threads.y));
+		median_kernel_3x3_16u<<<blocks, threads>>>(src.ptr<uint16_t>(), dst.ptr<uint16_t>(), cols, rows, step);
+	}
+
+	CUDA_CHECK(cudaGetLastError());
+}
+
+} // namespace details
+} // namespace sgm
@@ -0,0 +1,30 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#ifndef __TYPES_H__
+#define __TYPES_H__
+
+#include <cstdint>
+
+namespace sgm
+{
+
+// Element type of the cost volume read by the winner-takes-all kernel.
+using cost_type = uint8_t;
+// Element type of the disparity images written by the winner-takes-all kernel.
+using output_type = uint16_t;
+
+} // namespace sgm
+
+#endif // !__TYPES_H__
@@ -0,0 +1,270 @@
+/*
+Copyright 2016 Fixstars Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include "internal.h"
+
+#include <cuda_runtime.h>
+
+#include "device_utility.h"
+#include "host_utility.h"
+
+namespace sgm
+{
+namespace
+{
+
+// Each warp of a block handles one image row, so a block covers
+// WARPS_PER_BLOCK consecutive rows.
+static constexpr unsigned int WARPS_PER_BLOCK = 8u;
+// Threads per block used for the kernel launch.
+static constexpr unsigned int BLOCK_SIZE = WARPS_PER_BLOCK * WARP_SIZE;
+
+// Combines a cost and a disparity index into one 32-bit key: the cost is stored
+// in the .y half and the index in the .x half of a ushort2 that shares storage
+// with the returned word (both are truncated to 16 bits). unpack_cost() and
+// unpack_index() read them back from the upper and lower 16 bits, so comparing
+// keys orders by cost first and by index second.
+__device__ inline uint32_t pack_cost_index(uint32_t cost, uint32_t index)
+{
+	union {
+		uint32_t uint32;
+		ushort2 uint16x2;
+	} key;
+	key.uint16x2.y = static_cast<uint16_t>(cost);
+	key.uint16x2.x = static_cast<uint16_t>(index);
+	return key.uint32;
+}
+
+// Cost part of a packed key: its upper 16 bits.
+__device__ uint32_t unpack_cost(uint32_t packed)
+{
+	const uint32_t cost = packed >> 16;
+	return cost;
+}
+
+// Index part of a packed key: its lower 16 bits.
+__device__ int unpack_index(uint32_t packed)
+{
+	const uint32_t index = packed & 0xffffu;
+	return index;
+}
+
+// Signature of the function that converts the winning disparity into the value
+// stored in the output image: (disparity, cost of the winner, summed costs of
+// all disparities for that pixel).
+using ComputeDisparity = uint32_t(*)(uint32_t, uint32_t, uint16_t*);
+
+// Integer-precision output: returns disp unchanged; cost and smem are unused.
+__device__ inline uint32_t compute_disparity_normal(uint32_t disp, uint32_t cost = 0, uint16_t* smem = nullptr)
+{
+	return disp;
+}
+
+// Sub-pixel output: returns disp in fixed point with SUBPIXEL_SHIFT fractional
+// bits. For interior disparities (0 < disp < MAX_DISPARITY - 1) a correction
+// computed from the costs of the two neighbouring disparities is added:
+//   (left - right) / (2 * (left - 2 * cost + right)),
+// scaled by 2^SUBPIXEL_SHIFT and with `denom` added before the division.
+//   disp : winning disparity
+//   cost : summed cost of the winner
+//   smem : summed costs indexed by disparity for the current pixel
+// NOTE(review): denom must be non-zero. In the kernel below `cost` is the
+// minimum and ties resolve to the lower index, which appears to guarantee
+// left > cost and therefore denom > 0 — confirm before reusing elsewhere.
+template <size_t MAX_DISPARITY>
+__device__ inline uint32_t compute_disparity_subpixel(uint32_t disp, uint32_t cost, uint16_t* smem)
+{
+	int subp = disp;
+	subp <<= sgm::StereoSGM::SUBPIXEL_SHIFT;
+	if (disp > 0 && disp < MAX_DISPARITY - 1) {
+		const int left = smem[disp - 1];
+		const int right = smem[disp + 1];
+		const int numer = left - right;
+		const int denom = left - 2 * cost + right;
+		subp += ((numer << sgm::StereoSGM::SUBPIXEL_SHIFT) + denom) / (2 * denom);
+	}
+	return subp;
+}
+
+
+// Winner-takes-all kernel. One warp processes one image row
+// (y = blockIdx.x * WARPS_PER_BLOCK + warp_id). For every pixel x of the row it
+//   1. sums the costs of all NUM_PATHS paths for each disparity,
+//   2. writes to left_dest[x] the disparity with the smallest summed cost,
+//      converted by compute_disparity, or INVALID_DISP when the uniqueness
+//      test fails,
+//   3. tracks for each right-image pixel p = x - d the best disparity d seen
+//      and writes it to right_dest[p] (always at integer precision).
+// Parameters:
+//   left_dest, right_dest : output images, `pitch` elements per row
+//   src        : cost volume; path p starts at src + p * (MAX_DISPARITY * width * height)
+//                and is indexed as (y * width + x) * MAX_DISPARITY + disparity
+//   width, height : image size in pixels
+//   uniqueness : a disparity more than 1 away from the winner whose
+//                cost * uniqueness is below the winning cost invalidates the pixel
+template <unsigned int MAX_DISPARITY, unsigned int NUM_PATHS, ComputeDisparity compute_disparity = compute_disparity_normal>
+__global__ void winner_takes_all_kernel(
+	output_type *left_dest,
+	output_type *right_dest,
+	const cost_type *src,
+	int width,
+	int height,
+	int pitch,
+	float uniqueness)
+{
+	// costs loaded and summed by one lane per accumulation step
+	static const unsigned int ACCUMULATION_PER_THREAD = 16u;
+	// disparities handled by one lane during the reduction
+	static const unsigned int REDUCTION_PER_THREAD = MAX_DISPARITY / WARP_SIZE;
+	// pixels whose cost sums are produced by one accumulation step of the warp
+	static const unsigned int ACCUMULATION_INTERVAL = ACCUMULATION_PER_THREAD / REDUCTION_PER_THREAD;
+	static const unsigned int UNROLL_DEPTH = 
+		(REDUCTION_PER_THREAD > ACCUMULATION_INTERVAL)
+			? REDUCTION_PER_THREAD
+			: ACCUMULATION_INTERVAL;
+
+	// distance between the cost volumes of two consecutive paths
+	const size_t cost_step = static_cast<size_t>(MAX_DISPARITY) * width * height;
+	const unsigned int warp_id = threadIdx.x / WARP_SIZE;
+	const unsigned int lane_id = threadIdx.x % WARP_SIZE;
+
+	const unsigned int y = blockIdx.x * WARPS_PER_BLOCK + warp_id;
+	// NOTE(review): this offset is evaluated in 32-bit unsigned arithmetic and can
+	// wrap for very large images (cost_step above uses size_t) — confirm the
+	// supported image sizes.
+	src += y * MAX_DISPARITY * width;
+	left_dest  += y * pitch;
+	right_dest += y * pitch;
+
+	// whole warps beyond the last row exit together; the pointers adjusted above
+	// are not dereferenced in that case
+	if(y >= height){
+		return;
+	}
+
+	// per-warp staging area: summed costs of ACCUMULATION_INTERVAL pixels
+	__shared__ uint16_t smem_cost_sum[WARPS_PER_BLOCK][ACCUMULATION_INTERVAL][MAX_DISPARITY];
+
+	// best packed (cost, disparity) found so far for the right-image pixels this
+	// lane is currently tracking; 0xffffffff means "nothing seen yet"
+	uint32_t right_best[REDUCTION_PER_THREAD];
+	for(unsigned int i = 0; i < REDUCTION_PER_THREAD; ++i){
+		right_best[i] = 0xffffffffu;
+	}
+
+	for(unsigned int x0 = 0; x0 < width; x0 += UNROLL_DEPTH){
+#pragma unroll
+		for(unsigned int x1 = 0; x1 < UNROLL_DEPTH; ++x1){
+			// accumulation step: refill shared memory every ACCUMULATION_INTERVAL pixels
+			if(x1 % ACCUMULATION_INTERVAL == 0){
+				// each lane sums 16 consecutive costs: k_hi selects the pixel within
+				// the interval, k_lo the first disparity
+				const unsigned int k = lane_id * ACCUMULATION_PER_THREAD;
+				const unsigned int k_hi = k / MAX_DISPARITY;
+				const unsigned int k_lo = k % MAX_DISPARITY;
+				const unsigned int x = x0 + x1 + k_hi;
+				if(x < width){
+					const unsigned int offset = x * MAX_DISPARITY + k_lo;
+					uint32_t sum[ACCUMULATION_PER_THREAD];
+					for(unsigned int i = 0; i < ACCUMULATION_PER_THREAD; ++i){
+						sum[i] = 0;
+					}
+					for(unsigned int p = 0; p < NUM_PATHS; ++p){
+						uint32_t load_buffer[ACCUMULATION_PER_THREAD];
+						load_uint8_vector<ACCUMULATION_PER_THREAD>(
+							load_buffer, &src[p * cost_step + offset]);
+						for(unsigned int i = 0; i < ACCUMULATION_PER_THREAD; ++i){
+							sum[i] += load_buffer[i];
+						}
+					}
+					store_uint16_vector<ACCUMULATION_PER_THREAD>(
+						&smem_cost_sum[warp_id][k_hi][k_lo], sum);
+				}
+				// make the sums written by the other lanes of this warp visible
+				// before they are read below
+#if CUDA_VERSION >= 9000
+				__syncwarp();
+#else
+				__threadfence_block();
+#endif
+			}
+			const unsigned int x = x0 + x1;
+			if(x < width){
+				// Load sum of costs
+				const unsigned int smem_x = x1 % ACCUMULATION_INTERVAL;
+				const unsigned int k0 = lane_id * REDUCTION_PER_THREAD;
+				uint32_t local_cost_sum[REDUCTION_PER_THREAD];
+				load_uint16_vector<REDUCTION_PER_THREAD>(
+					local_cost_sum, &smem_cost_sum[warp_id][smem_x][k0]);
+				// Pack sum of costs and disparity
+				uint32_t local_packed_cost[REDUCTION_PER_THREAD];
+				for(unsigned int i = 0; i < REDUCTION_PER_THREAD; ++i){
+					local_packed_cost[i] = pack_cost_index(local_cost_sum[i], k0 + i);
+				}
+				// Update left
+				// minimum over this lane's disparities, then over the whole warp;
+				// the packed key orders by cost first, then by disparity
+				uint32_t best = 0xffffffffu;
+				for(unsigned int i = 0; i < REDUCTION_PER_THREAD; ++i){
+					best = min(best, local_packed_cost[i]);
+				}
+				best = subgroup_min<WARP_SIZE>(best, 0xffffffffu);
+				// Update right
+#pragma unroll
+				for(unsigned int i = 0; i < REDUCTION_PER_THREAD; ++i){
+					// slot i of this lane tracks the right-image pixels p with
+					// p % MAX_DISPARITY == k; p is the largest such pixel not right
+					// of x (negative when there is none yet) and d = x - p is the
+					// disparity linking left pixel x to right pixel p
+					const unsigned int k = lane_id * REDUCTION_PER_THREAD + i;
+					const int p = static_cast<int>(((x - k) & ~(MAX_DISPARITY - 1)) + k);
+					const unsigned int d = static_cast<unsigned int>(x - p);
+					// fetch the packed cost of disparity d from the lane that holds it;
+					// the register index is written in terms of the unrolled loop
+					// counters and appears to equal d % REDUCTION_PER_THREAD
+#if CUDA_VERSION >= 9000
+					const uint32_t recv = __shfl_sync(0xffffffffu,
+						local_packed_cost[(REDUCTION_PER_THREAD - i + x1) % REDUCTION_PER_THREAD],
+						d / REDUCTION_PER_THREAD,
+						WARP_SIZE);
+#else
+					const uint32_t recv = __shfl(
+						local_packed_cost[(REDUCTION_PER_THREAD - i + x1) % REDUCTION_PER_THREAD],
+						d / REDUCTION_PER_THREAD,
+						WARP_SIZE);
+#endif
+					right_best[i] = min(right_best[i], recv);
+					// d reached its maximum: pixel p has seen every disparity, so
+					// emit its result and start tracking the next pixel
+					if(d == MAX_DISPARITY - 1){
+						if(0 <= p){
+							right_dest[p] = compute_disparity_normal(unpack_index(right_best[i]));
+						}
+						right_best[i] = 0xffffffffu;
+					}
+				}
+				// Resume updating left to avoid execution dependency
+				const uint32_t bestCost = unpack_cost(best);
+				const int bestDisp = unpack_index(best);
+				// the winner is unique when every other disparity is either
+				// sufficiently more expensive or adjacent to the winner
+				bool uniq = true;
+				for(unsigned int i = 0; i < REDUCTION_PER_THREAD; ++i){
+					const uint32_t x = local_packed_cost[i];
+					const bool uniq1 = unpack_cost(x) * uniqueness >= bestCost;
+					const bool uniq2 = abs(unpack_index(x) - bestDisp) <= 1;
+					uniq &= uniq1 || uniq2;
+				}
+				uniq = subgroup_and<WARP_SIZE>(uniq, 0xffffffffu);
+				// a single lane writes the left result for this pixel
+				if(lane_id == 0){
+					left_dest[x] = uniq ? compute_disparity(bestDisp, bestCost, smem_cost_sum[warp_id][smem_x]) : INVALID_DISP;
+				}
+			}
+		}
+	}
+	// flush the right-image pixels that are still being tracked at the end of the row
+	for(unsigned int i = 0; i < REDUCTION_PER_THREAD; ++i){
+		const unsigned int k = lane_id * REDUCTION_PER_THREAD + i;
+		const int p = static_cast<int>(((width - k) & ~(MAX_DISPARITY - 1)) + k);
+		if(0 <= p && p < width){
+			right_dest[p] = compute_disparity_normal(unpack_index(right_best[i]));
+		}
+	}
+}
+
+} // namespace
+
+namespace details
+{
+
+// Launches winner_takes_all_kernel for a compile-time disparity count.
+// Image size and pitch are taken from dstL. The kernel variant is chosen from
+// path_type (8 paths for SCAN_8PATH, otherwise 4) and from subpixel (sub-pixel
+// output for SCAN_8PATH/SCAN_4PATH, integer output in every other case).
+// Launch errors are reported through CUDA_CHECK.
+template <int MAX_DISPARITY>
+void winner_takes_all_(const DeviceImage& src, DeviceImage& dstL, DeviceImage& dstR,
+	float uniqueness, bool subpixel, PathType path_type)
+{
+	const int width = dstL.cols;
+	const int height = dstL.rows;
+	const int pitch = dstL.step;
+
+	// one warp per row, WARPS_PER_BLOCK rows per block
+	const int gdim = divUp(height, WARPS_PER_BLOCK);
+	const int bdim = BLOCK_SIZE;
+
+	const cost_type* cost = src.ptr<cost_type>();
+	output_type* dispL = dstL.ptr<output_type>();
+	output_type* dispR = dstR.ptr<output_type>();
+
+	const bool use_8_paths = (path_type == PathType::SCAN_8PATH);
+	const bool use_4_paths = (path_type == PathType::SCAN_4PATH);
+
+	if (use_8_paths) {
+		if (subpixel) {
+			winner_takes_all_kernel<MAX_DISPARITY, 8, compute_disparity_subpixel<MAX_DISPARITY>><<<gdim, bdim>>>(
+				dispL, dispR, cost, width, height, pitch, uniqueness);
+		}
+		else {
+			winner_takes_all_kernel<MAX_DISPARITY, 8, compute_disparity_normal><<<gdim, bdim>>>(
+				dispL, dispR, cost, width, height, pitch, uniqueness);
+		}
+	}
+	else if (subpixel && use_4_paths) {
+		winner_takes_all_kernel<MAX_DISPARITY, 4, compute_disparity_subpixel<MAX_DISPARITY>><<<gdim, bdim>>>(
+			dispL, dispR, cost, width, height, pitch, uniqueness);
+	}
+	else {
+		// integer output with 4 paths; also the fallback for any other combination
+		winner_takes_all_kernel<MAX_DISPARITY, 4, compute_disparity_normal><<<gdim, bdim>>>(
+			dispL, dispR, cost, width, height, pitch, uniqueness);
+	}
+
+	CUDA_CHECK(cudaGetLastError());
+}
+
+// Runtime-to-compile-time dispatch on the disparity count. Only 64, 128 and 256
+// are supported; any other disp_size returns without doing anything.
+void winner_takes_all(const DeviceImage& src, DeviceImage& dstL, DeviceImage& dstR,
+	int disp_size, float uniqueness, bool subpixel, PathType path_type)
+{
+	switch (disp_size) {
+	case 64:
+		winner_takes_all_<64>(src, dstL, dstR, uniqueness, subpixel, path_type);
+		break;
+	case 128:
+		winner_takes_all_<128>(src, dstL, dstR, uniqueness, subpixel, path_type);
+		break;
+	case 256:
+		winner_takes_all_<256>(src, dstL, dstR, uniqueness, subpixel, path_type);
+		break;
+	default:
+		break;
+	}
+}
+
+} // namespace details
+} // namespace sgm
@@ -0,0 +1,30 @@
+cmake_minimum_required(VERSION 3.18)
+
+project(sgm-test LANGUAGES CXX CUDA)
+
+# The tests include internal library headers from the src directory next to
+# this one. Resolve it relative to this file rather than CMAKE_SOURCE_DIR so
+# the path stays valid when the library is embedded in a larger build.
+set(LIBSGM_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../src)
+
+# required packages
+find_package(CUDAToolkit REQUIRED)
+find_package(OpenCV REQUIRED)
+
+if (MSVC)
+	option(gtest_force_shared_crt "Force Gmock to use standard compiler flags" ON)
+endif()
+
+add_subdirectory(googletest)
+
+# CONFIGURE_DEPENDS re-evaluates the glob at build time so newly added test
+# sources are picked up without a manual re-configure.
+file(GLOB SRCS CONFIGURE_DEPENDS ./*.cpp ./*.cu ./*.h*)
+add_executable(sgm-test ${SRCS})
+
+target_compile_features(sgm-test PRIVATE cxx_std_17)
+target_include_directories(sgm-test PRIVATE ${LIBSGM_SOURCE_DIR} ${gtest_SOURCE_DIR}/include ${OpenCV_INCLUDE_DIRS})
+target_link_libraries(sgm-test PRIVATE sgm gtest ${OpenCV_LIBS})
+
+# The compiler id of GCC is "GNU" ("GCC" never matches). -Wall is limited to
+# C++ sources so it is not handed to the CUDA compiler driver. Generator
+# expressions are quoted so each one reaches CMake as a single argument.
+target_compile_options(
+	sgm-test PRIVATE
+	"$<$<CXX_COMPILER_ID:GNU,Clang>:-O3>"
+	"$<$<COMPILE_LANG_AND_ID:CXX,GNU,Clang>:-Wall>"
+	"$<$<CXX_COMPILER_ID:MSVC>:/wd4819>"
+	"$<$<COMPILE_LANGUAGE:CUDA>:-lineinfo>"
+)
@@ -0,0 +1,92 @@
+#include <gtest/gtest.h>
+
+#include <algorithm>
+
+#include "host_image.h"
+#include "device_image.h"
+#include "test_utility.h"
+#include "internal.h"
+#include "constants.h"
+
+namespace sgm
+{
+
+// Host reference: converts every 16-bit pixel of src to 8 bits with static_cast.
+// dst is (re)created as an SGM_8U image with the dimensions of src.
+void cast_16bit_to_8bit(const HostImage& src, HostImage& dst)
+{
+	const int h = src.rows;
+	// The width must come from src: dst is only sized by the create() call below,
+	// so reading dst.cols here would depend on whatever the caller passed in.
+	const int w = src.cols;
+
+	dst.create(h, w, SGM_8U);
+
+	for (int y = 0; y < h; y++)
+	{
+		const uint16_t* ptrSrc = src.ptr<uint16_t>(y);
+		uint8_t* ptrDst = dst.ptr<uint8_t>(y);
+		for (int x = 0; x < w; x++)
+			ptrDst[x] = static_cast<uint8_t>(ptrSrc[x]);
+	}
+}
+
+// Host reference: widens every 8-bit pixel of src to 16 bits with static_cast.
+// dst is (re)created as an SGM_16U image with the dimensions of src.
+void cast_8bit_to_16bit(const HostImage& src, HostImage& dst)
+{
+	const int h = src.rows;
+	// The width must come from src: dst is only sized by the create() call below,
+	// so reading dst.cols here would depend on whatever the caller passed in.
+	const int w = src.cols;
+
+	dst.create(h, w, SGM_16U);
+
+	for (int y = 0; y < h; y++)
+	{
+		const uint8_t* ptrSrc = src.ptr<uint8_t>(y);
+		uint16_t* ptrDst = dst.ptr<uint16_t>(y);
+		for (int x = 0; x < w; x++)
+			ptrDst[x] = static_cast<uint16_t>(ptrSrc[x]);
+	}
+}
+
+} // namespace sgm
+
+// Feeds the same random 16-bit image to the HostImage and DeviceImage overloads of
+// cast_16bit_to_8bit and expects identical 8-bit results.
+TEST(CastTest, RandomU16ToU8)
+{
+	using namespace sgm;
+	using namespace details;
+
+	constexpr int rows = 479;
+	constexpr int cols = 631;
+	constexpr int stride = 640;
+	const ImageType srcType = SGM_16U;
+	const ImageType dstType = SGM_8U;
+
+	HostImage hostSrc(rows, cols, srcType, stride);
+	HostImage hostDst(rows, cols, dstType, stride);
+	DeviceImage devSrc(rows, cols, srcType, stride);
+	DeviceImage devDst(rows, cols, dstType, stride);
+
+	// Same input on both sides.
+	random_fill(hostSrc);
+	devSrc.upload(hostSrc.data);
+
+	cast_16bit_to_8bit(hostSrc, hostDst);
+	cast_16bit_to_8bit(devSrc, devDst);
+
+	EXPECT_TRUE(equals(hostDst, devDst));
+}
+
+// Feeds the same random 8-bit image to the HostImage and DeviceImage overloads of
+// cast_8bit_to_16bit and expects identical 16-bit results.
+TEST(CastTest, RandomU8ToU16)
+{
+	using namespace sgm;
+	using namespace details;
+
+	constexpr int rows = 479;
+	constexpr int cols = 631;
+	constexpr int stride = 640;
+	const ImageType srcType = SGM_8U;
+	const ImageType dstType = SGM_16U;
+
+	HostImage hostSrc(rows, cols, srcType, stride);
+	HostImage hostDst(rows, cols, dstType, stride);
+	DeviceImage devSrc(rows, cols, srcType, stride);
+	DeviceImage devDst(rows, cols, dstType, stride);
+
+	// Same input on both sides.
+	random_fill(hostSrc);
+	devSrc.upload(hostSrc.data);
+
+	cast_8bit_to_16bit(hostSrc, hostDst);
+	cast_8bit_to_16bit(devSrc, devDst);
+
+	EXPECT_TRUE(equals(hostDst, devDst));
+}
@@ -0,0 +1,231 @@
+#include <gtest/gtest.h>
+
+#include "host_image.h"
+#include "device_image.h"
+#include "test_utility.h"
+#include "internal.h"
+
+namespace sgm
+{
+
+// Host reference for the 9x7 census transform; writes one 64-bit descriptor per pixel.
+// dst is zero-filled first, and pixels nearer to the border than the window radius are
+// never written, so they stay zero. A neighbour contributes a bit only when both of its
+// offsets are non-zero, i.e. the window's centre row and centre column are skipped.
+// Each bit is 1 when the centre pixel is strictly greater than the neighbour.
+template <typename T>
+static void census_transform_9x7_(const HostImage& src, HostImage& dst)
+{
+	constexpr int HALF_W = 9 / 2;
+	constexpr int HALF_H = 7 / 2;
+
+	dst.fill_zero();
+
+	const int vEnd = src.rows - HALF_H;
+	const int uEnd = src.cols - HALF_W;
+
+	for (int v = HALF_H; v < vEnd; ++v) {
+		uint64_t* outRow = dst.ptr<uint64_t>(v);
+		const T* centerRow = src.ptr<T>(v);
+		for (int u = HALF_W; u < uEnd; ++u) {
+			const T center = centerRow[u];
+			uint64_t bits = 0;
+			for (int dv = -HALF_H; dv <= HALF_H; ++dv) {
+				if (dv == 0)
+					continue;
+				const T* neighborRow = src.ptr<T>(v + dv);
+				for (int du = -HALF_W; du <= HALF_W; ++du) {
+					if (du == 0)
+						continue;
+					bits = (bits << 1) | static_cast<uint64_t>(center > neighborRow[u + du]);
+				}
+			}
+			outRow[u] = bits;
+		}
+	}
+}
+
+// Host reference for the symmetric 9x7 census; writes one 32-bit descriptor per pixel.
+// For every offset (du, dv) in the upper half of the window (all rows above the centre,
+// plus the part of the centre row left of the centre) the pixel at (u + du, v + dv) is
+// compared with its point-mirrored partner at (u - du, v - dv); the bit is 1 when the
+// former is strictly greater. Border pixels keep the zero written by fill_zero().
+template <typename T>
+static void symmetric_census_9x7_(const HostImage& src, HostImage& dst)
+{
+	constexpr int HALF_W = 9 / 2;
+	constexpr int HALF_H = 7 / 2;
+
+	dst.fill_zero();
+
+	const int vEnd = src.rows - HALF_H;
+	const int uEnd = src.cols - HALF_W;
+
+	for (int v = HALF_H; v < vEnd; ++v) {
+		uint32_t* outRow = dst.ptr<uint32_t>(v);
+		for (int u = HALF_W; u < uEnd; ++u) {
+			uint32_t bits = 0;
+			for (int dv = -HALF_H; dv <= 0; ++dv) {
+				const T* rowA = src.ptr<T>(v + dv);
+				const T* rowB = src.ptr<T>(v - dv);
+				// On the centre row stop just before the centre pixel itself.
+				const int duLast = (dv != 0) ? HALF_W : -1;
+				for (int du = -HALF_W; du <= duLast; ++du) {
+					bits = (bits << 1) | static_cast<uint32_t>(rowA[u + du] > rowB[u - du]);
+				}
+			}
+			outRow[u] = bits;
+		}
+	}
+}
+
+// Host-side census dispatcher. Allocates dst with the descriptor type that belongs to
+// the requested census variant (64-bit for CENSUS_9x7, 32-bit for SYMMETRIC_CENSUS_9x7)
+// and runs the reference implementation matching the pixel type of src.
+// Source types other than 8/16/32-bit unsigned leave the freshly created dst unprocessed.
+void census_transform(const HostImage& src, HostImage& dst, CensusType type)
+{
+	if (type == CensusType::CENSUS_9x7) {
+		dst.create(src.rows, src.cols, SGM_64U);
+		if (src.type == SGM_8U) {
+			census_transform_9x7_<uint8_t>(src, dst);
+		}
+		else if (src.type == SGM_16U) {
+			census_transform_9x7_<uint16_t>(src, dst);
+		}
+		else if (src.type == SGM_32U) {
+			census_transform_9x7_<uint32_t>(src, dst);
+		}
+	}
+	else if (type == CensusType::SYMMETRIC_CENSUS_9x7) {
+		dst.create(src.rows, src.cols, SGM_32U);
+		if (src.type == SGM_8U) {
+			symmetric_census_9x7_<uint8_t>(src, dst);
+		}
+		else if (src.type == SGM_16U) {
+			symmetric_census_9x7_<uint16_t>(src, dst);
+		}
+		else if (src.type == SGM_32U) {
+			symmetric_census_9x7_<uint32_t>(src, dst);
+		}
+	}
+}
+
+} // namespace sgm
+
+// CENSUS_9x7 on a random 8-bit image: the HostImage and DeviceImage overloads of
+// census_transform must produce the same 64-bit descriptors.
+TEST(CensusTransformTest, RandomU8)
+{
+	using namespace sgm;
+	using namespace details;
+
+	constexpr int rows = 479;
+	constexpr int cols = 631;
+	constexpr int stride = 640;
+	const ImageType srcType = SGM_8U;
+	const ImageType dstType = SGM_64U;
+	const CensusType mode = CensusType::CENSUS_9x7;
+
+	HostImage hostSrc(rows, cols, srcType, stride);
+	HostImage hostDst(rows, cols, dstType);
+	DeviceImage devSrc(rows, cols, srcType, stride);
+	DeviceImage devDst(rows, cols, dstType);
+
+	random_fill(hostSrc);
+	devSrc.upload(hostSrc.data);
+	// The host reference zero-fills its output itself; do the same for the device buffer.
+	devDst.fill_zero();
+
+	census_transform(hostSrc, hostDst, mode);
+	census_transform(devSrc, devDst, mode);
+
+	EXPECT_TRUE(equals(hostDst, devDst));
+}
+
+// CENSUS_9x7 on a random 16-bit image: the HostImage and DeviceImage overloads of
+// census_transform must produce the same 64-bit descriptors.
+TEST(CensusTransformTest, RandomU16)
+{
+	using namespace sgm;
+	using namespace details;
+
+	constexpr int rows = 479;
+	constexpr int cols = 631;
+	constexpr int stride = 640;
+	const ImageType srcType = SGM_16U;
+	const ImageType dstType = SGM_64U;
+	const CensusType mode = CensusType::CENSUS_9x7;
+
+	HostImage hostSrc(rows, cols, srcType, stride);
+	HostImage hostDst(rows, cols, dstType);
+	DeviceImage devSrc(rows, cols, srcType, stride);
+	DeviceImage devDst(rows, cols, dstType);
+
+	random_fill(hostSrc);
+	devSrc.upload(hostSrc.data);
+	// The host reference zero-fills its output itself; do the same for the device buffer.
+	devDst.fill_zero();
+
+	census_transform(hostSrc, hostDst, mode);
+	census_transform(devSrc, devDst, mode);
+
+	EXPECT_TRUE(equals(hostDst, devDst));
+}
+
+// CENSUS_9x7 on a random 32-bit image: the HostImage and DeviceImage overloads of
+// census_transform must produce the same 64-bit descriptors.
+TEST(CensusTransformTest, RandomU32)
+{
+	using namespace sgm;
+	using namespace details;
+
+	constexpr int rows = 479;
+	constexpr int cols = 631;
+	constexpr int stride = 640;
+	const ImageType srcType = SGM_32U;
+	const ImageType dstType = SGM_64U;
+	const CensusType mode = CensusType::CENSUS_9x7;
+
+	HostImage hostSrc(rows, cols, srcType, stride);
+	HostImage hostDst(rows, cols, dstType);
+	DeviceImage devSrc(rows, cols, srcType, stride);
+	DeviceImage devDst(rows, cols, dstType);
+
+	random_fill(hostSrc);
+	devSrc.upload(hostSrc.data);
+	// The host reference zero-fills its output itself; do the same for the device buffer.
+	devDst.fill_zero();
+
+	census_transform(hostSrc, hostDst, mode);
+	census_transform(devSrc, devDst, mode);
+
+	EXPECT_TRUE(equals(hostDst, devDst));
+}
+
+// SYMMETRIC_CENSUS_9x7 on a random 8-bit image: the HostImage and DeviceImage overloads
+// of census_transform must produce the same 32-bit descriptors.
+TEST(SymmetricCensusTest, RandomU8)
+{
+	using namespace sgm;
+	using namespace details;
+
+	constexpr int rows = 479;
+	constexpr int cols = 631;
+	constexpr int stride = 640;
+	const ImageType srcType = SGM_8U;
+	const ImageType dstType = SGM_32U;
+	const CensusType mode = CensusType::SYMMETRIC_CENSUS_9x7;
+
+	HostImage hostSrc(rows, cols, srcType, stride);
+	HostImage hostDst(rows, cols, dstType);
+	DeviceImage devSrc(rows, cols, srcType, stride);
+	DeviceImage devDst(rows, cols, dstType);
+
+	random_fill(hostSrc);
+	devSrc.upload(hostSrc.data);
+	// The host reference zero-fills its output itself; do the same for the device buffer.
+	devDst.fill_zero();
+
+	census_transform(hostSrc, hostDst, mode);
+	census_transform(devSrc, devDst, mode);
+
+	EXPECT_TRUE(equals(hostDst, devDst));
+}
+
+// SYMMETRIC_CENSUS_9x7 on a random 16-bit image: the HostImage and DeviceImage overloads
+// of census_transform must produce the same 32-bit descriptors.
+TEST(SymmetricCensusTest, Random16U)
+{
+	using namespace sgm;
+	using namespace details;
+
+	constexpr int rows = 479;
+	constexpr int cols = 631;
+	constexpr int stride = 640;
+	const ImageType srcType = SGM_16U;
+	const ImageType dstType = SGM_32U;
+	const CensusType mode = CensusType::SYMMETRIC_CENSUS_9x7;
+
+	HostImage hostSrc(rows, cols, srcType, stride);
+	HostImage hostDst(rows, cols, dstType);
+	DeviceImage devSrc(rows, cols, srcType, stride);
+	DeviceImage devDst(rows, cols, dstType);
+
+	random_fill(hostSrc);
+	devSrc.upload(hostSrc.data);
+	// The host reference zero-fills its output itself; do the same for the device buffer.
+	devDst.fill_zero();
+
+	census_transform(hostSrc, hostDst, mode);
+	census_transform(devSrc, devDst, mode);
+
+	EXPECT_TRUE(equals(hostDst, devDst));
+}
+
+// SYMMETRIC_CENSUS_9x7 on a random 32-bit image: the HostImage and DeviceImage overloads
+// of census_transform must produce the same 32-bit descriptors.
+TEST(SymmetricCensusTest, Random32U)
+{
+	using namespace sgm;
+	using namespace details;
+
+	constexpr int rows = 479;
+	constexpr int cols = 631;
+	constexpr int stride = 640;
+	const ImageType srcType = SGM_32U;
+	const ImageType dstType = SGM_32U;
+	const CensusType mode = CensusType::SYMMETRIC_CENSUS_9x7;
+
+	HostImage hostSrc(rows, cols, srcType, stride);
+	HostImage hostDst(rows, cols, dstType);
+	DeviceImage devSrc(rows, cols, srcType, stride);
+	DeviceImage devDst(rows, cols, dstType);
+
+	random_fill(hostSrc);
+	devSrc.upload(hostSrc.data);
+	// The host reference zero-fills its output itself; do the same for the device buffer.
+	devDst.fill_zero();
+
+	census_transform(hostSrc, hostDst, mode);
+	census_transform(devSrc, devDst, mode);
+
+	EXPECT_TRUE(equals(hostDst, devDst));
+}
@@ -0,0 +1,236 @@
+#include <gtest/gtest.h>
+
+#include <algorithm>
+
+#include "host_image.h"
+#include "device_image.h"
+#include "test_utility.h"
+#include "internal.h"
+#include "constants.h"
+
+namespace sgm
+{
+
+// Host reference for the left/right consistency check; edits dispL in place.
+// A left disparity is replaced by INVALID_DISP when
+//   * the source pixel (used as a mask) is 0, or
+//   * it is already INVALID_DISP, or
+//   * LR_max_diff is non-negative, the matching column x - d lies inside the image and
+//     the right disparity there differs from d by more than LR_max_diff.
+// With subpixel enabled, d is the stored value shifted right by SUBPIXEL_SHIFT.
+template <typename SRC_T>
+static void check_consistency_(HostImage& dispL, const HostImage& dispR, const HostImage& srcL,
+	bool subpixel, int LR_max_diff)
+{
+	using DST_T = uint16_t;
+
+	const int rows = srcL.rows;
+	const int cols = srcL.cols;
+
+	for (int y = 0; y < rows; ++y) {
+		const SRC_T* maskRow = srcL.ptr<SRC_T>(y);
+		DST_T* leftRow = dispL.ptr<DST_T>(y);
+		const DST_T* rightRow = dispR.ptr<DST_T>(y);
+
+		for (int x = 0; x < cols; ++x) {
+			const SRC_T mask = maskRow[x];
+			const DST_T raw = leftRow[x];
+			const int d = subpixel ? (raw >> sgm::StereoSGM::SUBPIXEL_SHIFT) : raw;
+			const int k = x - d;
+
+			bool reject = (mask == 0) || (raw == sgm::INVALID_DISP);
+			// The right image is only read when the column is valid and the check is enabled.
+			if (!reject && k >= 0 && k < cols && LR_max_diff >= 0) {
+				reject = abs(rightRow[k] - d) > LR_max_diff;
+			}
+
+			if (reject) {
+				leftRow[x] = static_cast<DST_T>(sgm::INVALID_DISP);
+			}
+		}
+	}
+}
+
+// Host-side dispatcher: picks the check_consistency_ instantiation that matches the
+// pixel type of srcL. Types other than 8/16/32-bit unsigned leave dispL untouched.
+void check_consistency(HostImage& dispL, const HostImage& dispR, const HostImage& srcL,
+	bool subpixel, int LR_max_diff)
+{
+	if (srcL.type == SGM_8U) {
+		check_consistency_<uint8_t>(dispL, dispR, srcL, subpixel, LR_max_diff);
+	}
+	else if (srcL.type == SGM_16U) {
+		check_consistency_<uint16_t>(dispL, dispR, srcL, subpixel, LR_max_diff);
+	}
+	else if (srcL.type == SGM_32U) {
+		check_consistency_<uint32_t>(dispL, dispR, srcL, subpixel, LR_max_diff);
+	}
+}
+
+} // namespace sgm
+
+// Integer disparities, 8-bit source image: the HostImage and DeviceImage overloads of
+// check_consistency must leave identical left disparity maps.
+TEST(CheckConsistencyTest, RandomU8)
+{
+	using namespace sgm;
+	using namespace details;
+
+	constexpr int rows = 479;
+	constexpr int cols = 631;
+	constexpr int stride = 640;
+	const ImageType srcType = SGM_8U;
+	const ImageType dispType = SGM_16U;
+	constexpr int maxDiff = 5;
+	constexpr bool useSubpixel = false;
+
+	HostImage hostSrc(rows, cols, srcType, stride);
+	HostImage hostDispL(rows, cols, dispType, stride);
+	HostImage hostDispR(rows, cols, dispType, stride);
+	DeviceImage devSrc(rows, cols, srcType, stride);
+	DeviceImage devDispL(rows, cols, dispType, stride);
+	DeviceImage devDispR(rows, cols, dispType, stride);
+
+	random_fill(hostSrc);
+	random_fill(hostDispL);
+	random_fill(hostDispR);
+
+	// Mirror all three inputs on the device before either side modifies them.
+	devSrc.upload(hostSrc.data);
+	devDispL.upload(hostDispL.data);
+	devDispR.upload(hostDispR.data);
+
+	check_consistency(hostDispL, hostDispR, hostSrc, useSubpixel, maxDiff);
+	check_consistency(devDispL, devDispR, devSrc, useSubpixel, maxDiff);
+
+	EXPECT_TRUE(equals(hostDispL, devDispL));
+}
+
+// Integer disparities, 16-bit source image: the HostImage and DeviceImage overloads of
+// check_consistency must leave identical left disparity maps.
+TEST(CheckConsistencyTest, RandomU16)
+{
+	using namespace sgm;
+	using namespace details;
+
+	constexpr int rows = 479;
+	constexpr int cols = 631;
+	constexpr int stride = 640;
+	const ImageType srcType = SGM_16U;
+	const ImageType dispType = SGM_16U;
+	constexpr int maxDiff = 5;
+	constexpr bool useSubpixel = false;
+
+	HostImage hostSrc(rows, cols, srcType, stride);
+	HostImage hostDispL(rows, cols, dispType, stride);
+	HostImage hostDispR(rows, cols, dispType, stride);
+	DeviceImage devSrc(rows, cols, srcType, stride);
+	DeviceImage devDispL(rows, cols, dispType, stride);
+	DeviceImage devDispR(rows, cols, dispType, stride);
+
+	random_fill(hostSrc);
+	random_fill(hostDispL);
+	random_fill(hostDispR);
+
+	// Mirror all three inputs on the device before either side modifies them.
+	devSrc.upload(hostSrc.data);
+	devDispL.upload(hostDispL.data);
+	devDispR.upload(hostDispR.data);
+
+	check_consistency(hostDispL, hostDispR, hostSrc, useSubpixel, maxDiff);
+	check_consistency(devDispL, devDispR, devSrc, useSubpixel, maxDiff);
+
+	EXPECT_TRUE(equals(hostDispL, devDispL));
+}
+
+// Integer disparities, 32-bit source image: the HostImage and DeviceImage overloads of
+// check_consistency must leave identical left disparity maps.
+TEST(CheckConsistencyTest, RandomU32)
+{
+	using namespace sgm;
+	using namespace details;
+
+	constexpr int rows = 479;
+	constexpr int cols = 631;
+	constexpr int stride = 640;
+	const ImageType srcType = SGM_32U;
+	const ImageType dispType = SGM_16U;
+	constexpr int maxDiff = 5;
+	constexpr bool useSubpixel = false;
+
+	HostImage hostSrc(rows, cols, srcType, stride);
+	HostImage hostDispL(rows, cols, dispType, stride);
+	HostImage hostDispR(rows, cols, dispType, stride);
+	DeviceImage devSrc(rows, cols, srcType, stride);
+	DeviceImage devDispL(rows, cols, dispType, stride);
+	DeviceImage devDispR(rows, cols, dispType, stride);
+
+	random_fill(hostSrc);
+	random_fill(hostDispL);
+	random_fill(hostDispR);
+
+	// Mirror all three inputs on the device before either side modifies them.
+	devSrc.upload(hostSrc.data);
+	devDispL.upload(hostDispL.data);
+	devDispR.upload(hostDispR.data);
+
+	check_consistency(hostDispL, hostDispR, hostSrc, useSubpixel, maxDiff);
+	check_consistency(devDispL, devDispR, devSrc, useSubpixel, maxDiff);
+
+	EXPECT_TRUE(equals(hostDispL, devDispL));
+}
+
+// Subpixel disparities, 8-bit source image: the HostImage and DeviceImage overloads of
+// check_consistency must leave identical left disparity maps.
+TEST(CheckConsistencyTest, RandomU8_Subpixel)
+{
+	using namespace sgm;
+	using namespace details;
+
+	constexpr int rows = 479;
+	constexpr int cols = 631;
+	constexpr int stride = 640;
+	const ImageType srcType = SGM_8U;
+	const ImageType dispType = SGM_16U;
+	constexpr int maxDiff = 5;
+	constexpr bool useSubpixel = true;
+
+	HostImage hostSrc(rows, cols, srcType, stride);
+	HostImage hostDispL(rows, cols, dispType, stride);
+	HostImage hostDispR(rows, cols, dispType, stride);
+	DeviceImage devSrc(rows, cols, srcType, stride);
+	DeviceImage devDispL(rows, cols, dispType, stride);
+	DeviceImage devDispR(rows, cols, dispType, stride);
+
+	random_fill(hostSrc);
+	random_fill(hostDispL);
+	random_fill(hostDispR);
+
+	// Mirror all three inputs on the device before either side modifies them.
+	devSrc.upload(hostSrc.data);
+	devDispL.upload(hostDispL.data);
+	devDispR.upload(hostDispR.data);
+
+	check_consistency(hostDispL, hostDispR, hostSrc, useSubpixel, maxDiff);
+	check_consistency(devDispL, devDispR, devSrc, useSubpixel, maxDiff);
+
+	EXPECT_TRUE(equals(hostDispL, devDispL));
+}
+
+// Subpixel disparities, 16-bit source image: the HostImage and DeviceImage overloads of
+// check_consistency must leave identical left disparity maps.
+TEST(CheckConsistencyTest, RandomU16_Subpixel)
+{
+	using namespace sgm;
+	using namespace details;
+
+	constexpr int rows = 479;
+	constexpr int cols = 631;
+	constexpr int stride = 640;
+	const ImageType srcType = SGM_16U;
+	const ImageType dispType = SGM_16U;
+	constexpr int maxDiff = 5;
+	constexpr bool useSubpixel = true;
+
+	HostImage hostSrc(rows, cols, srcType, stride);
+	HostImage hostDispL(rows, cols, dispType, stride);
+	HostImage hostDispR(rows, cols, dispType, stride);
+	DeviceImage devSrc(rows, cols, srcType, stride);
+	DeviceImage devDispL(rows, cols, dispType, stride);
+	DeviceImage devDispR(rows, cols, dispType, stride);
+
+	random_fill(hostSrc);
+	random_fill(hostDispL);
+	random_fill(hostDispR);
+
+	// Mirror all three inputs on the device before either side modifies them.
+	devSrc.upload(hostSrc.data);
+	devDispL.upload(hostDispL.data);
+	devDispR.upload(hostDispR.data);
+
+	check_consistency(hostDispL, hostDispR, hostSrc, useSubpixel, maxDiff);
+	check_consistency(devDispL, devDispR, devSrc, useSubpixel, maxDiff);
+
+	EXPECT_TRUE(equals(hostDispL, devDispL));
+}
+
+// Subpixel disparities, 32-bit source image: the HostImage and DeviceImage overloads of
+// check_consistency must leave identical left disparity maps.
+TEST(CheckConsistencyTest, RandomU32_Subpixel)
+{
+	using namespace sgm;
+	using namespace details;
+
+	constexpr int rows = 479;
+	constexpr int cols = 631;
+	constexpr int stride = 640;
+	const ImageType srcType = SGM_32U;
+	const ImageType dispType = SGM_16U;
+	constexpr int maxDiff = 5;
+	constexpr bool useSubpixel = true;
+
+	HostImage hostSrc(rows, cols, srcType, stride);
+	HostImage hostDispL(rows, cols, dispType, stride);
+	HostImage hostDispR(rows, cols, dispType, stride);
+	DeviceImage devSrc(rows, cols, srcType, stride);
+	DeviceImage devDispL(rows, cols, dispType, stride);
+	DeviceImage devDispR(rows, cols, dispType, stride);
+
+	random_fill(hostSrc);
+	random_fill(hostDispL);
+	random_fill(hostDispR);
+
+	// Mirror all three inputs on the device before either side modifies them.
+	devSrc.upload(hostSrc.data);
+	devDispL.upload(hostDispL.data);
+	devDispR.upload(hostDispR.data);
+
+	check_consistency(hostDispL, hostDispR, hostSrc, useSubpixel, maxDiff);
+	check_consistency(devDispL, devDispR, devSrc, useSubpixel, maxDiff);
+
+	EXPECT_TRUE(equals(hostDispL, devDispL));
+}
@@ -0,0 +1,73 @@
+#include <gtest/gtest.h>
+
+#include <algorithm>
+
+#include "host_image.h"
+#include "device_image.h"
+#include "test_utility.h"
+#include "internal.h"
+#include "constants.h"
+
+namespace sgm
+{
+
+// Host reference: shifts a 16-bit disparity map by min_disp, in place.
+// Valid values get min_disp (times SUBPIXEL_SCALE when subpixel is set) added;
+// INVALID_DISP entries are rewritten to (min_disp - 1) times the same scale.
+// Results are stored back as uint16_t, so they are reduced modulo 2^16.
+void correct_disparity_range(HostImage& disp, bool subpixel, int min_disp)
+{
+	const int rows = disp.rows;
+	const int cols = disp.cols;
+
+	const int factor = subpixel ? StereoSGM::SUBPIXEL_SCALE : 1;
+	const int offset = min_disp * factor;
+	const int invalidValue = (min_disp - 1) * factor;
+
+	for (int y = 0; y < rows; ++y) {
+		uint16_t* row = disp.ptr<uint16_t>(y);
+		for (int x = 0; x < cols; ++x) {
+			const uint16_t cur = row[x];
+			const int shifted = (cur == sgm::INVALID_DISP) ? invalidValue : cur + offset;
+			row[x] = static_cast<uint16_t>(shifted);
+		}
+	}
+}
+
+} // namespace sgm
+
+// Test parameters: (disp_size, subpixel flag as 0/1, min_disp).
+using Parameters = std::tuple<int, int, int>;
+
+// Fixture for the parameterized disparity-range correction test.
+class CorrectDisparityRangeTest : public ::testing::TestWithParam<Parameters> {};
+
+// Every combination of disp_size {64, 128, 256}, subpixel {0, 1} and min_disp {0, +16, -16}.
+// INSTANTIATE_TEST_SUITE_P replaces the deprecated INSTANTIATE_TEST_CASE_P spelling.
+INSTANTIATE_TEST_SUITE_P(TestWithParams, CorrectDisparityRangeTest,
+	::testing::Combine(::testing::Values(64, 128, 256), ::testing::Values(0, 1), ::testing::Values(0, +16, -16)));
+
+// Runs the HostImage and DeviceImage overloads of correct_disparity_range on the same
+// random disparity map and expects identical results for every parameter combination.
+TEST_P(CorrectDisparityRangeTest, Random16U)
+{
+	using namespace sgm;
+	using namespace details;
+
+	const int w = 631;
+	const int h = 479;
+	const int pitch = 640;
+	const ImageType dtype = SGM_16U;
+
+	const auto param = GetParam();
+	const int disp_size = std::get<0>(param);
+	const bool subpixel = std::get<1>(param) > 0;
+	// Must stay an int: a bool would collapse the +16 / -16 parameters to 1, so the
+	// non-zero and negative offsets would never actually be exercised.
+	const int min_disp = std::get<2>(param);
+
+	HostImage h_disp(h, w, dtype, pitch);
+	DeviceImage d_disp(h, w, dtype, pitch);
+
+	random_fill(h_disp, 0, disp_size);
+	d_disp.upload(h_disp.data);
+
+	correct_disparity_range(h_disp, subpixel, min_disp);
+	correct_disparity_range(d_disp, subpixel, min_disp);
+
+	EXPECT_TRUE(equals(h_disp, d_disp));
+}
@@ -0,0 +1,185 @@
+#include <gtest/gtest.h>
+
+#include <algorithm>
+
+#include "host_image.h"
+#include "device_image.h"
+#include "test_utility.h"
+#include "internal.h"
+#include "constants.h"
+
+// Population-count helpers used by HammingDistance below: the __popcnt / __popcnt64
+// intrinsics when _WIN32 is defined, the GCC/Clang builtins otherwise.
+// NOTE(review): _WIN32 is also defined by non-MSVC Windows toolchains - confirm that
+// __popcnt is available there, or key this on the compiler instead.
+#ifdef _WIN32
+#define popcnt32 __popcnt
+#define popcnt64 __popcnt64
+#else
+#define popcnt32 __builtin_popcount
+#define popcnt64 __builtin_popcountll
+#endif
+
+// One parameter set for CostAggregationTest. The member order matters: the params
+// table initialises these fields positionally.
+struct CostAggregationParam
+{
+	sgm::ImageType census_type; // element type of the census images used as input
+	int disp_size;              // forwarded to cost_aggregation as disp_size
+	int P1;                     // forwarded to cost_aggregation as P1
+	int P2;                     // forwarded to cost_aggregation as P2
+	int min_disp;               // forwarded to cost_aggregation as min_disp
+};
+
+// Parameter table for CostAggregationTest.
+// Columns: { census_type, disp_size, P1, P2, min_disp }.
+// Covers both census element types (32/64-bit), disparity sizes 64/128/256 and
+// min_disp values 0/+16/-16, always with P1 = 10 and P2 = 120.
+static CostAggregationParam params[] = {
+	{ sgm::SGM_32U,  64, 10, 120,  +0 },
+	{ sgm::SGM_32U,  64, 10, 120, +16 },
+	{ sgm::SGM_32U,  64, 10, 120, -16 },
+	{ sgm::SGM_32U, 128, 10, 120,  +0 },
+	{ sgm::SGM_32U, 128, 10, 120, +16 },
+	{ sgm::SGM_32U, 128, 10, 120, -16 },
+	{ sgm::SGM_32U, 256, 10, 120,  +0 },
+	{ sgm::SGM_32U, 256, 10, 120, +16 },
+	{ sgm::SGM_32U, 256, 10, 120, -16 },
+	{ sgm::SGM_64U,  64, 10, 120,  +0 },
+	{ sgm::SGM_64U,  64, 10, 120, +16 },
+	{ sgm::SGM_64U,  64, 10, 120, -16 },
+	{ sgm::SGM_64U, 128, 10, 120,  +0 },
+	{ sgm::SGM_64U, 128, 10, 120, +16 },
+	{ sgm::SGM_64U, 128, 10, 120, -16 },
+	{ sgm::SGM_64U, 256, 10, 120,  +0 },
+	{ sgm::SGM_64U, 256, 10, 120, +16 },
+	{ sgm::SGM_64U, 256, 10, 120, -16 },
+};
+
+namespace sgm
+{
+
+using COST_TYPE = uint8_t;
+
+// Number of bit positions in which two 64-bit census descriptors differ.
+static inline int HammingDistance(uint64_t c1, uint64_t c2)
+{
+	const uint64_t diff = c1 ^ c2;
+	return static_cast<int>(popcnt64(diff));
+}
+
+// Number of bit positions in which two 32-bit census descriptors differ.
+static inline int HammingDistance(uint32_t c1, uint32_t c2)
+{
+	const uint32_t diff = c1 ^ c2;
+	return static_cast<int>(popcnt32(diff));
+}
+
+// Smallest of four integers.
+static inline int min4(int x, int y, int z, int w)
+{
+	const int lowXY = std::min(x, y);
+	const int lowZW = std::min(z, w);
+	return std::min(lowXY, lowZW);
+}
+
+// Host reference that aggregates matching costs along one path direction (ru, rv).
+//
+//   srcL, srcR : census images, read as CENSUS_TYPE
+//   dst        : output costs; the row with index (v * w + u) holds the disp_size
+//                costs of pixel (u, v)
+//   disp_size  : number of disparities evaluated per pixel
+//   P1, P2     : penalties used in the recurrence below
+//   min_disp   : offset added to d when locating the right-image column
+//   ru, rv     : step of the path; the predecessor of (uc, vc) is (uc - ru, vc - rv)
+//
+// Every intermediate is stored in a COST_TYPE (uint8_t) local, so each step is
+// narrowed to 8 bits exactly where the code does so.
+// NOTE(review): presumably this mirrors the 8-bit arithmetic of the device kernel
+// the test compares against - confirm before changing any of the types.
+template <typename CENSUS_TYPE>
+static void cost_aggregation_(const HostImage& srcL, const HostImage& srcR, HostImage& dst,
+	int disp_size, int P1, int P2, int min_disp, int ru, int rv)
+{
+	const int h = srcL.rows;
+	const int w = srcL.cols;
+	const int n = disp_size;
+
+	// Scan in increasing (v, u) order when the path points down, or right within a
+	// row; otherwise scan in reverse, so the predecessor is visited before the pixel.
+	const bool forward = rv > 0 || (rv == 0 && ru > 0);
+	int u0 = 0, u1 = w, du = 1, v0 = 0, v1 = h, dv = 1;
+	if (!forward) {
+		u0 = w - 1; u1 = -1; du = -1;
+		v0 = h - 1; v1 = -1; dv = -1;
+	}
+
+	// Stand-in predecessor costs for pixels whose predecessor lies outside the image.
+	std::vector<COST_TYPE> zero(disp_size, 0);
+
+	for (int vc = v0; vc != v1; vc += dv) {
+
+		const CENSUS_TYPE* censusL = srcL.ptr<CENSUS_TYPE>(vc);
+		const CENSUS_TYPE* censusR = srcR.ptr<CENSUS_TYPE>(vc);
+		for (int uc = u0; uc != u1; uc += du) {
+
+			// Predecessor position along the path.
+			const int vp = vc - rv;
+			const int up = uc - ru;
+			const bool inside = vp >= 0 && vp < h&& up >= 0 && up < w;
+
+			const CENSUS_TYPE cL = censusL[uc];
+			COST_TYPE* Lc = dst.ptr<COST_TYPE>(vc * w + uc);
+			COST_TYPE* Lp = inside ? dst.ptr<COST_TYPE>(vp * w + up) : zero.data();
+
+			// Minimum predecessor cost over all disparities.
+			COST_TYPE minLp = std::numeric_limits<COST_TYPE>::max();
+			for (int d = 0; d < n; d++)
+				minLp = std::min(minLp, Lp[d]);
+
+			// P1 with the predecessor minimum already subtracted (narrowed to COST_TYPE).
+			const COST_TYPE _P1 = P1 - minLp;
+			for (int d = 0; d < n; d++) {
+				// Right-image column for disparity d; columns outside the image use census 0.
+				const int uR = uc - d - min_disp;
+				const CENSUS_TYPE cR = uR >= 0 && uR < w ? censusR[uR] : 0;
+				const COST_TYPE MC = HammingDistance(cL, cR);
+				// Candidates: same disparity, disparity -1 / +1 (0xFF at the range ends), and P2.
+				const COST_TYPE Lp0 = Lp[d] - minLp;
+				const COST_TYPE Lp1 = d > 0 ?     Lp[d - 1] + _P1 : 0xFF;
+				const COST_TYPE Lp2 = d < n - 1 ? Lp[d + 1] + _P1 : 0xFF;
+				const COST_TYPE Lp3 = P2;
+				Lc[d] = static_cast<COST_TYPE>(MC + min4(Lp0, Lp1, Lp2, Lp3));
+			}
+		}
+	}
+}
+
+// Single-direction dispatcher: selects the cost_aggregation_ instantiation that matches
+// the census element type of srcL. Types other than 32/64-bit leave dst untouched.
+static void cost_aggregation(const HostImage& srcL, const HostImage& srcR, HostImage& dst,
+	int disp_size, int P1, int P2, int min_disp, int ru, int rv)
+{
+	if (srcL.type == SGM_32U) {
+		cost_aggregation_<uint32_t>(srcL, srcR, dst, disp_size, P1, P2, min_disp, ru, rv);
+	}
+	else if (srcL.type == SGM_64U) {
+		cost_aggregation_<uint64_t>(srcL, srcR, dst, disp_size, P1, P2, min_disp, ru, rv);
+	}
+}
+
+// Host reference for the full aggregation stage: runs the single-direction aggregation
+// once per path and stores each result in its own row of dst.
+// dst is created with one row per path, each holding h * w * disp_size 8-bit costs.
+// SCAN_4PATH uses the first four directions of the tables below (the two vertical and
+// two horizontal ones); any other path type uses all eight.
+void cost_aggregation(const HostImage& srcL, const HostImage& srcR, HostImage& dst,
+	int disp_size, int P1, int P2, PathType path_type, int min_disp)
+{
+	constexpr int DIRECTION_COUNT = 8;
+	const int stepU[DIRECTION_COUNT] = { +0, +0, +1, -1, +1, -1, -1, +1 };
+	const int stepV[DIRECTION_COUNT] = { +1, -1, +0, +0, +1, +1, -1, -1 };
+
+	const int cols = srcL.cols;
+	const int rows = srcL.rows;
+
+	int pathCount = 8;
+	if (path_type == PathType::SCAN_4PATH) {
+		pathCount = 4;
+	}
+
+	dst.create(pathCount, rows * cols * disp_size, SGM_8U);
+
+	for (int path = 0; path < pathCount; ++path) {
+		// View over this path's row of dst, shaped as (pixels x disparities).
+		HostImage pathCost(dst.ptr<COST_TYPE>(path), rows * cols, disp_size, SGM_8U);
+		cost_aggregation(srcL, srcR, pathCost, disp_size, P1, P2, min_disp, stepU[path], stepV[path]);
+	}
+}
+
+} // namespace sgm
+
+// Fixture for the parameterized cost-aggregation test; one instance per params entry.
+class CostAggregationTest : public ::testing::TestWithParam<CostAggregationParam> {};
+// INSTANTIATE_TEST_SUITE_P replaces the deprecated INSTANTIATE_TEST_CASE_P spelling.
+INSTANTIATE_TEST_SUITE_P(TestWithParams, CostAggregationTest, ::testing::ValuesIn(params));
+
+// Aggregates random census images over all eight paths with the HostImage and
+// DeviceImage overloads of cost_aggregation and compares the costs path by path.
+TEST_P(CostAggregationTest, AllPathsTest)
+{
+	using namespace sgm;
+	using namespace details;
+
+	const auto param = GetParam();
+
+	constexpr int cols = 320;
+	constexpr int rows = 240;
+	const int dispSize = param.disp_size;
+	const auto pathType = PathType::SCAN_8PATH;
+	const int pathCount = (pathType == PathType::SCAN_4PATH) ? 4 : 8;
+	const int penalty1 = param.P1;
+	const int penalty2 = param.P2;
+	const int minDisp = param.min_disp;
+
+	const ImageType censusType = param.census_type;
+	const ImageType costType = SGM_8U;
+
+	HostImage hostCensusL(rows, cols, censusType);
+	HostImage hostCensusR(rows, cols, censusType);
+	HostImage hostCosts;
+
+	DeviceImage devCensusL(rows, cols, censusType);
+	DeviceImage devCensusR(rows, cols, censusType);
+	DeviceImage devCosts;
+
+	random_fill(hostCensusL);
+	random_fill(hostCensusR);
+	devCensusL.upload(hostCensusL.data);
+	devCensusR.upload(hostCensusR.data);
+
+	cost_aggregation(hostCensusL, hostCensusR, hostCosts, dispSize, penalty1, penalty2, pathType, minDisp);
+	cost_aggregation(devCensusL, devCensusR, devCosts, dispSize, penalty1, penalty2, pathType, minDisp);
+
+	// Each path occupies one row of the cost buffers; compare them one at a time.
+	for (int path = 0; path < pathCount; ++path) {
+		HostImage hostPath(hostCosts.ptr<COST_TYPE>(path), rows * cols, dispSize, costType);
+		DeviceImage devPath(devCosts.ptr<COST_TYPE>(path), rows * cols, dispSize, costType);
+		EXPECT_TRUE(equals(hostPath, devPath));
+	}
+}
@@ -0,0 +1,4 @@
+# Run manually to reformat a file:
+# clang-format -i --style=file <file>
+Language:        Cpp
+BasedOnStyle:  Google
@@ -0,0 +1,43 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: ''
+labels: 'bug'
+assignees: ''
+---
+
+**Describe the bug**
+
+Include a clear and concise description of what the problem is, including what
+you expected to happen, and what actually happened.
+
+**Steps to reproduce the bug**
+
+It's important that we are able to reproduce the problem that you are
+experiencing. Please provide all code and relevant steps to reproduce the
+problem, including your `BUILD`/`CMakeLists.txt` file and build commands. Links
+to a GitHub branch or [godbolt.org](https://godbolt.org/) that demonstrate the
+problem are also helpful.
+
+**Does the bug persist in the most recent commit?**
+
+We recommend using the latest commit in the master branch in your projects.
+
+**What operating system and version are you using?**
+
+If you are using a Linux distribution please include the name and version of the
+distribution as well.
+
+**What compiler and version are you using?**
+
+Please include the output of `gcc -v` or `clang -v`, or the equivalent for your
+compiler.
+
+**What build system are you using?**
+
+Please include the output of `bazel --version` or `cmake --version`, or the
+equivalent for your build system.
+
+**Additional context**
+
+Add any other context about the problem here.
@@ -0,0 +1,24 @@
+---
+name: Feature request
+about: Propose a new feature
+title: ''
+labels: 'enhancement'
+assignees: ''
+---
+
+**Does the feature exist in the most recent commit?**
+
+We recommend using the latest commit from GitHub in your projects.
+
+**Why do we need this feature?**
+
+Ideally, explain why a combination of existing features cannot be used instead.
+
+**Describe the proposal**
+
+Include a detailed description of the feature, with usage examples.
+
+**Is the feature specific to an operating system, compiler, or build system version?**
+
+If it is, please specify which versions.
+
@@ -0,0 +1 @@
+blank_issues_enabled: false
@@ -0,0 +1,40 @@
+name: ci
+
+on:
+  push:
+  pull_request:
+
+jobs:
+  Linux:
+    runs-on: ubuntu-latest
+    steps:
+
+    - uses: actions/checkout@v2
+      with:
+        fetch-depth: 0
+
+    - name: Tests
+      run: bazel test --test_output=errors //...
+
+  MacOs:
+    runs-on: macos-latest
+    steps:
+
+    - uses: actions/checkout@v2
+      with:
+        fetch-depth: 0
+
+    - name: Tests
+      run: bazel test --test_output=errors //...
+
+
+  Windows:
+    runs-on: windows-latest
+    steps:
+
+    - uses: actions/checkout@v2
+      with:
+        fetch-depth: 0
+
+    - name: Tests
+      run: bazel test --test_output=errors //...
@@ -0,0 +1,84 @@
+# Ignore CI build directory
+build/
+xcuserdata
+cmake-build-debug/
+.idea/
+bazel-bin
+bazel-genfiles
+bazel-googletest
+bazel-out
+bazel-testlogs
+# python
+*.pyc
+
+# Visual Studio files
+.vs
+*.sdf
+*.opensdf
+*.VC.opendb
+*.suo
+*.user
+_ReSharper.Caches/
+Win32-Debug/
+Win32-Release/
+x64-Debug/
+x64-Release/
+
+# Ignore autoconf / automake files
+Makefile.in
+aclocal.m4
+configure
+build-aux/
+autom4te.cache/
+googletest/m4/libtool.m4
+googletest/m4/ltoptions.m4
+googletest/m4/ltsugar.m4
+googletest/m4/ltversion.m4
+googletest/m4/lt~obsolete.m4
+googlemock/m4
+
+# Ignore generated directories.
+googlemock/fused-src/
+googletest/fused-src/
+
+# macOS files
+.DS_Store
+googletest/.DS_Store
+googletest/xcode/.DS_Store
+
+# Ignore cmake generated directories and files.
+CMakeFiles
+CTestTestfile.cmake
+Makefile
+cmake_install.cmake
+googlemock/CMakeFiles
+googlemock/CTestTestfile.cmake
+googlemock/Makefile
+googlemock/cmake_install.cmake
+googlemock/gtest
+/bin
+/googlemock/gmock.dir
+/googlemock/gmock_main.dir
+/googlemock/RUN_TESTS.vcxproj.filters
+/googlemock/RUN_TESTS.vcxproj
+/googlemock/INSTALL.vcxproj.filters
+/googlemock/INSTALL.vcxproj
+/googlemock/gmock_main.vcxproj.filters
+/googlemock/gmock_main.vcxproj
+/googlemock/gmock.vcxproj.filters
+/googlemock/gmock.vcxproj
+/googlemock/gmock.sln
+/googlemock/ALL_BUILD.vcxproj.filters
+/googlemock/ALL_BUILD.vcxproj
+/lib
+/Win32
+/ZERO_CHECK.vcxproj.filters
+/ZERO_CHECK.vcxproj
+/RUN_TESTS.vcxproj.filters
+/RUN_TESTS.vcxproj
+/INSTALL.vcxproj.filters
+/INSTALL.vcxproj
+/googletest-distribution.sln
+/CMakeCache.txt
+/ALL_BUILD.vcxproj.filters
+/ALL_BUILD.vcxproj
@@ -0,0 +1,218 @@
+# Copyright 2017 Google Inc.
+# All Rights Reserved.
+#
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+#     * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+#     * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+#   Bazel Build for Google C++ Testing Framework(Google Test)
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])
+
+exports_files(["LICENSE"])
+
+# Configuration switches referenced by the select() expressions in this file.
+
+# True when the target platform has the QNX OS constraint.
+config_setting(
+    name = "qnx",
+    constraint_values = ["@platforms//os:qnx"],
+)
+
+# True when the target platform has the Windows OS constraint.
+config_setting(
+    name = "windows",
+    constraint_values = ["@platforms//os:windows"],
+)
+
+# True when the target platform has the FreeBSD OS constraint.
+config_setting(
+    name = "freebsd",
+    constraint_values = ["@platforms//os:freebsd"],
+)
+
+# True when the target platform has the OpenBSD OS constraint.
+config_setting(
+    name = "openbsd",
+    constraint_values = ["@platforms//os:openbsd"],
+)
+
+# True when the C++ toolchain's compiler flag is "msvc-cl"; visible to this
+# package and its subpackages only.
+config_setting(
+    name = "msvc_compiler",
+    flag_values = {
+        "@bazel_tools//tools/cpp:compiler": "msvc-cl",
+    },
+    visibility = [":__subpackages__"],
+)
+
+# True when the build is invoked with --define absl=1.
+config_setting(
+    name = "has_absl",
+    values = {"define": "absl=1"},
+)
+
+# Library that defines the FRIEND_TEST macro.
+# Exposes only gtest_prod.h, with googletest/include on the include path.
+cc_library(
+    name = "gtest_prod",
+    hdrs = ["googletest/include/gtest/gtest_prod.h"],
+    includes = ["googletest/include"],
+)
+
+# Google Test including Google Mock
+# Built from the individual googletest/googlemock sources; the *-all.cc and
+# *_main.cc files are excluded from this target.
+cc_library(
+    name = "gtest",
+    srcs = glob(
+        include = [
+            "googletest/src/*.cc",
+            "googletest/src/*.h",
+            "googletest/include/gtest/**/*.h",
+            "googlemock/src/*.cc",
+            "googlemock/include/gmock/**/*.h",
+        ],
+        exclude = [
+            "googletest/src/gtest-all.cc",
+            "googletest/src/gtest_main.cc",
+            "googlemock/src/gmock-all.cc",
+            "googlemock/src/gmock_main.cc",
+        ],
+    ),
+    hdrs = glob([
+        "googletest/include/gtest/*.h",
+        "googlemock/include/gmock/*.h",
+    ]),
+    # -pthread is passed to the compiler everywhere except QNX and Windows.
+    copts = select({
+        ":qnx": [],
+        ":windows": [],
+        "//conditions:default": ["-pthread"],
+    }),
+    # GTEST_HAS_ABSL=1 is defined only for builds using --define absl=1.
+    defines = select({
+        ":has_absl": ["GTEST_HAS_ABSL=1"],
+        "//conditions:default": [],
+    }),
+    features = select({
+        ":windows": ["windows_export_all_symbols"],
+        "//conditions:default": [],
+    }),
+    includes = [
+        "googlemock",
+        "googlemock/include",
+        "googletest",
+        "googletest/include",
+    ],
+    # Per-platform link flags: regex library on QNX, libm plus pthread on the
+    # BSDs, nothing on Windows, pthread elsewhere.
+    linkopts = select({
+        ":qnx": ["-lregex"],
+        ":windows": [],
+        ":freebsd": [
+            "-lm",
+            "-pthread",
+        ],
+        ":openbsd": [
+            "-lm",
+            "-pthread",
+        ],
+        "//conditions:default": ["-pthread"],
+    }),
+    # The Abseil and RE2 dependencies are added only for --define absl=1 builds.
+    deps = select({
+        ":has_absl": [
+            "@com_google_absl//absl/debugging:failure_signal_handler",
+            "@com_google_absl//absl/debugging:stacktrace",
+            "@com_google_absl//absl/debugging:symbolize",
+            "@com_google_absl//absl/flags:flag",
+            "@com_google_absl//absl/flags:parse",
+            "@com_google_absl//absl/flags:reflection",
+            "@com_google_absl//absl/flags:usage",
+            "@com_google_absl//absl/strings",
+            "@com_google_absl//absl/types:any",
+            "@com_google_absl//absl/types:optional",
+            "@com_google_absl//absl/types:variant",
+            "@com_googlesource_code_re2//:re2",
+        ],
+        "//conditions:default": [],
+    }),
+)
+
+# Adds googlemock/src/gmock_main.cc on top of :gtest.
+# NOTE(review): presumably this supplies the main() entry point for test
+# binaries - confirm against gmock_main.cc.
+cc_library(
+    name = "gtest_main",
+    srcs = ["googlemock/src/gmock_main.cc"],
+    features = select({
+        ":windows": ["windows_export_all_symbols"],
+        "//conditions:default": [],
+    }),
+    deps = [":gtest"],
+)
+
+# The following rules build samples of how to use gTest.
+# Library with the sample sources and headers that the sample tests depend on.
+cc_library(
+    name = "gtest_sample_lib",
+    srcs = [
+        "googletest/samples/sample1.cc",
+        "googletest/samples/sample2.cc",
+        "googletest/samples/sample4.cc",
+    ],
+    hdrs = [
+        "googletest/samples/prime_tables.h",
+        "googletest/samples/sample1.h",
+        "googletest/samples/sample2.h",
+        "googletest/samples/sample3-inl.h",
+        "googletest/samples/sample4.h",
+    ],
+    features = select({
+        ":windows": ["windows_export_all_symbols"],
+        "//conditions:default": [],
+    }),
+)
+
+# Single test binary covering sample unit tests 1 through 8, linked against
+# the sample library and :gtest_main.
+cc_test(
+    name = "gtest_samples",
+    size = "small",
+    # All Samples except:
+    #   sample9 (main)
+    #   sample10 (main and takes a command line option and needs to be separate)
+    srcs = [
+        "googletest/samples/sample1_unittest.cc",
+        "googletest/samples/sample2_unittest.cc",
+        "googletest/samples/sample3_unittest.cc",
+        "googletest/samples/sample4_unittest.cc",
+        "googletest/samples/sample5_unittest.cc",
+        "googletest/samples/sample6_unittest.cc",
+        "googletest/samples/sample7_unittest.cc",
+        "googletest/samples/sample8_unittest.cc",
+    ],
+    linkstatic = 0,
+    deps = [
+        "gtest_sample_lib",
+        ":gtest_main",
+    ],
+)
+
+# Sample 9 is built as its own test and depends on :gtest rather than
+# :gtest_main.
+cc_test(
+    name = "sample9_unittest",
+    size = "small",
+    srcs = ["googletest/samples/sample9_unittest.cc"],
+    deps = [":gtest"],
+)
+
+# Sample 10 is built as its own test and depends on :gtest rather than
+# :gtest_main.
+cc_test(
+    name = "sample10_unittest",
+    size = "small",
+    srcs = ["googletest/samples/sample10_unittest.cc"],
+    deps = [":gtest"],
+)
@@ -0,0 +1,34 @@
+# Note: CMake support is community-based. The maintainers do not use CMake
+# internally.
+#
+# Top-level build script: selects policies, declares the distribution project
+# and its version, then descends into googlemock (which also builds googletest)
+# or into googletest alone, depending on BUILD_GMOCK.
+
+cmake_minimum_required(VERSION 3.5)
+
+# CMP0048: let project() manage the *_VERSION variables.
+if(POLICY CMP0048)
+  cmake_policy(SET CMP0048 NEW)
+endif()
+
+# CMP0077: option() honors a normal variable of the same name, so a parent
+# project can preset BUILD_GMOCK / INSTALL_GTEST with a plain set().
+if(POLICY CMP0077)
+  cmake_policy(SET CMP0077 NEW)
+endif()
+
+project(googletest-distribution)
+set(GOOGLETEST_VERSION 1.12.1)
+
+# Request strict ISO C++ (no compiler extensions) everywhere except Cygwin,
+# MSYS and QNX. Both operands are quoted so if() compares them as literal
+# strings instead of dereferencing a variable that happens to share the name.
+if(NOT CYGWIN AND NOT MSYS AND NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "QNX")
+  set(CMAKE_CXX_EXTENSIONS OFF)
+endif()
+
+enable_testing()
+
+include(CMakeDependentOption)
+include(GNUInstallDirs)
+
+# Note that the googlemock target already builds googletest.
+option(BUILD_GMOCK "Builds the googlemock subproject" ON)
+option(INSTALL_GTEST "Enable installation of googletest. (Projects embedding googletest may want to turn this OFF.)" ON)
+
+if(BUILD_GMOCK)
+  add_subdirectory(googlemock)
+else()
+  add_subdirectory(googletest)
+endif()
@@ -0,0 +1,131 @@
+# How to become a contributor and submit your own code
+
+## Contributor License Agreements
+
+We'd love to accept your patches! Before we can take them, we have to jump a
+couple of legal hurdles.
+
+Please fill out either the individual or corporate Contributor License Agreement
+(CLA).
+
+*   If you are an individual writing original source code and you're sure you
+    own the intellectual property, then you'll need to sign an
+    [individual CLA](https://developers.google.com/open-source/cla/individual).
+*   If you work for a company that wants to allow you to contribute your work,
+    then you'll need to sign a
+    [corporate CLA](https://developers.google.com/open-source/cla/corporate).
+
+Follow either of the two links above to access the appropriate CLA and
+instructions for how to sign and return it. Once we receive it, we'll be able to
+accept your pull requests.
+
+## Are you a Googler?
+
+If you are a Googler, please make an attempt to submit an internal contribution
+rather than a GitHub Pull Request. If you are not able to submit internally, a
+PR is acceptable as an alternative.
+
+## Contributing A Patch
+
+1.  Submit an issue describing your proposed change to the
+    [issue tracker](https://github.com/google/googletest/issues).
+2.  Please don't mix more than one logical change per submittal, because it
+    makes the history hard to follow. If you want to make a change that doesn't
+    have a corresponding issue in the issue tracker, please create one.
+3.  Also, coordinate with team members that are listed on the issue in question.
+    This ensures that work isn't being duplicated and communicating your plan
+    early also generally leads to better patches.
+4.  If your proposed change is accepted, and you haven't already done so, sign a
+    Contributor License Agreement
+    ([see details above](#contributor-license-agreements)).
+5.  Fork the desired repo, develop and test your code changes.
+6.  Ensure that your code adheres to the existing style in the sample to which
+    you are contributing.
+7.  Ensure that your code has an appropriate set of unit tests which all pass.
+8.  Submit a pull request.
+
+## The Google Test and Google Mock Communities
+
+The Google Test community exists primarily through the
+[discussion group](http://groups.google.com/group/googletestframework) and the
+GitHub repository. Likewise, the Google Mock community exists primarily through
+their own [discussion group](http://groups.google.com/group/googlemock). You are
+definitely encouraged to contribute to the discussion and you can also help us
+to keep the effectiveness of the group high by following and promoting the
+guidelines listed here.
+
+### Please Be Friendly
+
+Showing courtesy and respect to others is a vital part of the Google culture,
+and we strongly encourage everyone participating in Google Test development to
+join us in accepting nothing less. Of course, being courteous is not the same as
+failing to constructively disagree with each other, but it does mean that we
+should be respectful of each other when enumerating the 42 technical reasons
+that a particular proposal may not be the best choice. There's never a reason to
+be antagonistic or dismissive toward anyone who is sincerely trying to
+contribute to a discussion.
+
+Sure, C++ testing is serious business and all that, but it's also a lot of fun.
+Let's keep it that way. Let's strive to be one of the friendliest communities in
+all of open source.
+
+As always, discuss Google Test in the official GoogleTest discussion group. You
+don't have to actually submit code in order to sign up. Your participation
+itself is a valuable contribution.
+
+## Style
+
+To keep the source consistent, readable, diffable and easy to merge, we use a
+fairly rigid coding style, as defined by the
+[google-styleguide](https://github.com/google/styleguide) project. All patches
+will be expected to conform to the style outlined
+[here](https://google.github.io/styleguide/cppguide.html). Use
+[.clang-format](https://github.com/google/googletest/blob/master/.clang-format)
+to check your formatting.
+
+## Requirements for Contributors
+
+If you plan to contribute a patch, you need to build Google Test, Google Mock,
+and their own tests from a git checkout, which has further requirements:
+
+*   [Python](https://www.python.org/) v2.3 or newer (for running some of the
+    tests and re-generating certain source files from templates)
+*   [CMake](https://cmake.org/) v3.5 or newer
+
+## Developing Google Test and Google Mock
+
+This section discusses how to make your own changes to the Google Test project.
+
+### Testing Google Test and Google Mock Themselves
+
+To make sure your changes work as intended and don't break existing
+functionality, you'll want to compile and run Google Test's and Google Mock's
+own tests. For that you can use CMake:
+
+    mkdir mybuild
+    cd mybuild
+    cmake -Dgtest_build_tests=ON -Dgmock_build_tests=ON ${GTEST_REPO_DIR}
+
+To set up the tests for only Google Test or only Google Mock, change your cmake
+command to one of the following:
+
+    cmake -Dgtest_build_tests=ON ${GTEST_DIR} # sets up Google Test tests
+    cmake -Dgmock_build_tests=ON ${GMOCK_DIR} # sets up Google Mock tests
+
+Make sure you have Python installed, as some of Google Test's tests are written
+in Python. If the cmake command complains about not being able to find Python
+(`Could NOT find PythonInterp (missing: PYTHON_EXECUTABLE)`), try telling it
+explicitly where your Python executable can be found:
+
+    cmake -DPYTHON_EXECUTABLE=path/to/python ...
+
+Next, you can build Google Test and / or Google Mock and all desired tests. On
+\*nix, this is usually done by
+
+    make
+
+To run the tests, do
+
+    make test
+
+All tests should pass.
@@ -0,0 +1,65 @@
+# This file contains a list of people who've made non-trivial
+# contributions to the Google C++ Testing Framework project.  People
+# who commit code to the project are encouraged to add their names
+# here.  Please keep the list sorted by first names.
+
+Ajay Joshi <jaj@google.com>
+Balázs Dán <balazs.dan@gmail.com>
+Benoit Sigoure <tsuna@google.com>
+Bharat Mediratta <bharat@menalto.com>
+Bogdan Piloca <boo@google.com>
+Chandler Carruth <chandlerc@google.com>
+Chris Prince <cprince@google.com>
+Chris Taylor <taylorc@google.com>
+Dan Egnor <egnor@google.com>
+Dave MacLachlan <dmaclach@gmail.com>
+David Anderson <danderson@google.com>
+Dean Sturtevant
+Eric Roman <eroman@chromium.org>
+Gene Volovich <gv@cite.com>
+Hady Zalek <hady.zalek@gmail.com>
+Hal Burch <gmock@hburch.com>
+Jeffrey Yasskin <jyasskin@google.com>
+Jim Keller <jimkeller@google.com>
+Joe Walnes <joe@truemesh.com>
+Jon Wray <jwray@google.com>
+Jói Sigurðsson <joi@google.com>
+Keir Mierle <mierle@gmail.com>
+Keith Ray <keith.ray@gmail.com>
+Kenton Varda <kenton@google.com>
+Kostya Serebryany <kcc@google.com>
+Krystian Kuzniarek <krystian.kuzniarek@gmail.com>
+Lev Makhlis
+Manuel Klimek <klimek@google.com>
+Mario Tanev <radix@google.com>
+Mark Paskin
+Markus Heule <markus.heule@gmail.com>
+Martijn Vels <mvels@google.com>
+Matthew Simmons <simmonmt@acm.org>
+Mika Raento <mikie@iki.fi>
+Mike Bland <mbland@google.com>
+Miklós Fazekas <mfazekas@szemafor.com>
+Neal Norwitz <nnorwitz@gmail.com>
+Nermin Ozkiranartli <nermin@google.com>
+Owen Carlsen <ocarlsen@google.com>
+Paneendra Ba <paneendra@google.com>
+Pasi Valminen <pasi.valminen@gmail.com>
+Patrick Hanna <phanna@google.com>
+Patrick Riley <pfr@google.com>
+Paul Menage <menage@google.com>
+Peter Kaminski <piotrk@google.com>
+Piotr Kaminski <piotrk@google.com>
+Preston Jackson <preston.a.jackson@gmail.com>
+Rainer Klaffenboeck <rainer.klaffenboeck@dynatrace.com>
+Russ Cox <rsc@google.com>
+Russ Rufer <russ@pentad.com>
+Sean Mcafee <eefacm@gmail.com>
+Sigurður Ásgeirsson <siggi@google.com>
+Sverre Sundsdal <sundsdal@gmail.com>
+Szymon Sobik <sobik.szymon@gmail.com>
+Takeshi Yoshino <tyoshino@google.com>
+Tracy Bialik <tracy@pentad.com>
+Vadim Berman <vadimb@google.com>
+Vlad Losev <vladl@google.com>
+Wolfgang Klier <wklier@google.com>
+Zhanyong Wan <wan@google.com>
@@ -0,0 +1,28 @@
+Copyright 2008, Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+    * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,141 @@
+# GoogleTest
+
+### Announcements
+
+#### Live at Head
+
+GoogleTest now follows the
+[Abseil Live at Head philosophy](https://abseil.io/about/philosophy#upgrade-support).
+We recommend
+[updating to the latest commit in the `main` branch as often as possible](https://github.com/abseil/abseil-cpp/blob/master/FAQ.md#what-is-live-at-head-and-how-do-i-do-it).
+
+#### Documentation Updates
+
+Our documentation is now live on GitHub Pages at
+https://google.github.io/googletest/. We recommend browsing the documentation on
+GitHub Pages rather than directly in the repository.
+
+#### Release 1.11.0
+
+[Release 1.11.0](https://github.com/google/googletest/releases/tag/release-1.11.0)
+is now available.
+
+#### Coming Soon
+
+*   We are planning to take a dependency on
+    [Abseil](https://github.com/abseil/abseil-cpp).
+*   More documentation improvements are planned.
+
+## Welcome to **GoogleTest**, Google's C++ test framework!
+
+This repository is a merger of the formerly separate GoogleTest and GoogleMock
+projects. These were so closely related that it makes sense to maintain and
+release them together.
+
+### Getting Started
+
+See the [GoogleTest User's Guide](https://google.github.io/googletest/) for
+documentation. We recommend starting with the
+[GoogleTest Primer](https://google.github.io/googletest/primer.html).
+
+More information about building GoogleTest can be found at
+[googletest/README.md](googletest/README.md).
+
+## Features
+
+*   An [xUnit](https://en.wikipedia.org/wiki/XUnit) test framework.
+*   Test discovery.
+*   A rich set of assertions.
+*   User-defined assertions.
+*   Death tests.
+*   Fatal and non-fatal failures.
+*   Value-parameterized tests.
+*   Type-parameterized tests.
+*   Various options for running the tests.
+*   XML test report generation.
+
+## Supported Platforms
+
+GoogleTest requires a codebase and compiler compliant with the C++11 standard or
+newer.
+
+The GoogleTest code is officially supported on the following platforms.
+Operating systems or tools not listed below are community-supported. For
+community-supported platforms, patches that do not complicate the code may be
+considered.
+
+If you notice any problems on your platform, please file an issue on the
+[GoogleTest GitHub Issue Tracker](https://github.com/google/googletest/issues).
+Pull requests containing fixes are welcome!
+
+### Operating Systems
+
+*   Linux
+*   macOS
+*   Windows
+
+### Compilers
+
+*   gcc 5.0+
+*   clang 5.0+
+*   MSVC 2015+
+
+**macOS users:** Xcode 9.3+ provides clang 5.0+.
+
+### Build Systems
+
+*   [Bazel](https://bazel.build/)
+*   [CMake](https://cmake.org/)
+
+**Note:** Bazel is the build system used by the team internally and in tests.
+CMake is supported on a best-effort basis and by the community.
+
+## Who Is Using GoogleTest?
+
+In addition to many internal projects at Google, GoogleTest is also used by the
+following notable projects:
+
+*   The [Chromium projects](http://www.chromium.org/) (behind the Chrome browser
+    and Chrome OS).
+*   The [LLVM](http://llvm.org/) compiler.
+*   [Protocol Buffers](https://github.com/google/protobuf), Google's data
+    interchange format.
+*   The [OpenCV](http://opencv.org/) computer vision library.
+
+## Related Open Source Projects
+
+[GTest Runner](https://github.com/nholthaus/gtest-runner) is a Qt5 based
+automated test-runner and Graphical User Interface with powerful features for
+Windows and Linux platforms.
+
+[GoogleTest UI](https://github.com/ospector/gtest-gbar) is a test runner that
+runs your test binary, allows you to track its progress via a progress bar, and
+displays a list of test failures. Clicking on one shows failure text. GoogleTest
+UI is written in C#.
+
+[GTest TAP Listener](https://github.com/kinow/gtest-tap-listener) is an event
+listener for GoogleTest that implements the
+[TAP protocol](https://en.wikipedia.org/wiki/Test_Anything_Protocol) for test
+result output. If your test runner understands TAP, you may find it useful.
+
+[gtest-parallel](https://github.com/google/gtest-parallel) is a test runner that
+runs tests from your binary in parallel to provide significant speed-up.
+
+[GoogleTest Adapter](https://marketplace.visualstudio.com/items?itemName=DavidSchuldenfrei.gtest-adapter)
+is a VS Code extension that lets you view GoogleTest tests in a tree view and
+run/debug them.
+
+[C++ TestMate](https://github.com/matepek/vscode-catch2-test-adapter) is a VS
+Code extension that lets you view GoogleTest tests in a tree view and run/debug
+them.
+
+[Cornichon](https://pypi.org/project/cornichon/) is a small Gherkin DSL parser
+that generates stub code for GoogleTest.
+
+## Contributing Changes
+
+Please read
+[`CONTRIBUTING.md`](https://github.com/google/googletest/blob/master/CONTRIBUTING.md)
+for details on how to contribute to this project.
+
+Happy testing!
@@ -0,0 +1,39 @@
+workspace(name = "com_google_googletest")
+
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
+http_archive(
+    name = "com_google_absl",
+    sha256 = "1a1745b5ee81392f5ea4371a4ca41e55d446eeaee122903b2eaffbd8a3b67a2b",
+    strip_prefix = "abseil-cpp-01cc6567cff77738e416a7ddc17de2d435a780ce",
+    urls = ["https://github.com/abseil/abseil-cpp/archive/01cc6567cff77738e416a7ddc17de2d435a780ce.zip"],  # 2022-06-21T19:28:27Z
+)
+
+# Note this must use a commit from the `abseil` branch of the RE2 project.
+# https://github.com/google/re2/tree/abseil
+http_archive(
+    name = "com_googlesource_code_re2",
+    sha256 = "0a890c2aa0bb05b2ce906a15efb520d0f5ad4c7d37b8db959c43772802991887",
+    strip_prefix = "re2-a427f10b9fb4622dd6d8643032600aa1b50fbd12",
+    urls = ["https://github.com/google/re2/archive/a427f10b9fb4622dd6d8643032600aa1b50fbd12.zip"],  # 2022-06-09
+)
+
+http_archive(
+    name = "rules_python",
+    sha256 = "0b460f17771258341528753b1679335b629d1d25e3af28eda47d009c103a6e15",
+    strip_prefix = "rules_python-aef17ad72919d184e5edb7abf61509eb78e57eda",
+    urls = ["https://github.com/bazelbuild/rules_python/archive/aef17ad72919d184e5edb7abf61509eb78e57eda.zip"],  # 2022-06-21T23:44:47Z
+)
+
+http_archive(
+    name = "bazel_skylib",
+    urls = ["https://github.com/bazelbuild/bazel-skylib/releases/download/1.2.1/bazel-skylib-1.2.1.tar.gz"],
+    sha256 = "f7be3474d42aae265405a592bb7da8e171919d74c16f082a5457840f06054728",
+)
+
+http_archive(
+    name = "platforms",
+    sha256 = "a879ea428c6d56ab0ec18224f976515948822451473a80d06c2e50af0bbe5121",
+    strip_prefix = "platforms-da5541f26b7de1dc8e04c075c99df5351742a4a2",
+    urls = ["https://github.com/bazelbuild/platforms/archive/da5541f26b7de1dc8e04c075c99df5351742a4a2.zip"],  # 2022-05-27
+)
@@ -0,0 +1,130 @@
+#!/bin/bash
+#
+# Copyright 2020, Google Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+#     * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above
+# copyright notice, this list of conditions and the following disclaimer
+# in the documentation and/or other materials provided with the
+# distribution.
+#     * Neither the name of Google Inc. nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+set -euox pipefail
+
+readonly LINUX_LATEST_CONTAINER="gcr.io/google.com/absl-177019/linux_hybrid-latest:20220217"
+readonly LINUX_GCC_FLOOR_CONTAINER="gcr.io/google.com/absl-177019/linux_gcc-floor:20220621"
+
+if [[ -z ${GTEST_ROOT:-} ]]; then
+  GTEST_ROOT="$(realpath $(dirname ${0})/..)"
+fi
+
+if [[ -z ${STD:-} ]]; then
+  STD="c++11 c++14 c++17 c++20"
+fi
+
+# Test the CMake build
+for cc in /usr/local/bin/gcc /opt/llvm/clang/bin/clang; do
+  for cmake_off_on in OFF ON; do
+    time docker run \
+      --volume="${GTEST_ROOT}:/src:ro" \
+      --tmpfs="/build:exec" \
+      --workdir="/build" \
+      --rm \
+      --env="CC=${cc}" \
+      --env="CXX_FLAGS=\"-Werror -Wdeprecated\"" \
+      ${LINUX_LATEST_CONTAINER} \
+      /bin/bash -c "
+        cmake /src \
+          -DCMAKE_CXX_STANDARD=11 \
+          -Dgtest_build_samples=ON \
+          -Dgtest_build_tests=ON \
+          -Dgmock_build_tests=ON \
+          -Dcxx_no_exception=${cmake_off_on} \
+          -Dcxx_no_rtti=${cmake_off_on} && \
+        make -j$(nproc) && \
+        ctest -j$(nproc) --output-on-failure"
+  done
+done
+
+# Do one test with an older version of GCC
+time docker run \
+  --volume="${GTEST_ROOT}:/src:ro" \
+  --workdir="/src" \
+  --rm \
+  --env="CC=/usr/local/bin/gcc" \
+  ${LINUX_GCC_FLOOR_CONTAINER} \
+    /usr/local/bin/bazel test ... \
+      --copt="-Wall" \
+      --copt="-Werror" \
+      --copt="-Wuninitialized" \
+      --copt="-Wno-error=pragmas" \
+      --distdir="/bazel-distdir" \
+      --keep_going \
+      --show_timestamps \
+      --test_output=errors
+
+# Test GCC.
+# Runs `bazel test ...` in ${LINUX_LATEST_CONTAINER} for every combination of
+# C++ standard in ${STD} (passed via BAZEL_CXXOPTS=-std=...) and absl setting
+# (--define absl=0 or absl=1), with warnings treated as errors.
+for std in ${STD}; do
+  for absl in 0 1; do
+    time docker run \
+      --volume="${GTEST_ROOT}:/src:ro" \
+      --workdir="/src" \
+      --rm \
+      --env="CC=/usr/local/bin/gcc" \
+      --env="BAZEL_CXXOPTS=-std=${std}" \
+      ${LINUX_LATEST_CONTAINER} \
+      /usr/local/bin/bazel test ... \
+        --copt="-Wall" \
+        --copt="-Werror" \
+        --copt="-Wuninitialized" \
+        --define="absl=${absl}" \
+        --distdir="/bazel-distdir" \
+        --keep_going \
+        --show_timestamps \
+        --test_output=errors
+  done
+done
+
+# Test Clang.
+# Same standard x absl matrix as the GCC loop, but with CC pointing at clang.
+# --gcc-toolchain=/usr/local is passed to both the compile (--copt) and link
+# (--linkopt) steps -- presumably so clang uses the GCC installation under
+# /usr/local; confirm against the container layout.
+for std in ${STD}; do
+  for absl in 0 1; do
+    time docker run \
+      --volume="${GTEST_ROOT}:/src:ro" \
+      --workdir="/src" \
+      --rm \
+      --env="CC=/opt/llvm/clang/bin/clang" \
+      --env="BAZEL_CXXOPTS=-std=${std}" \
+      ${LINUX_LATEST_CONTAINER} \
+      /usr/local/bin/bazel test ... \
+        --copt="--gcc-toolchain=/usr/local" \
+        --copt="-Wall" \
+        --copt="-Werror" \
+        --copt="-Wuninitialized" \
+        --define="absl=${absl}" \
+        --distdir="/bazel-distdir" \
+        --keep_going \
+        --linkopt="--gcc-toolchain=/usr/local" \
+        --show_timestamps \
+        --test_output=errors
+  done
+done
--- a/Show More
+++ b/Show More