import numpy as np


def unproject_depth(depth, ixt, depth_min=None, depth_max=None, color=None,
                    ext=None, conf=None, ret_pcd=False, clip_box=None):
    """Unproject a depth map into a 3D point cloud using camera intrinsic parameters and optional transformations.
This function converts a 2D depth map into a set of 3D points by performing the following steps:
1. **Coordinate Grid Generation:**
Generates a grid of pixel coordinates corresponding to the depth image dimensions.
2. **Flattening and Masking:**
Flattens the x and y coordinate grids along with the depth values.
Applies filtering based on:
- A minimum depth threshold (`depth_min`),
- An optional maximum depth threshold (`depth_max`),
- And, if provided, a confidence map (`conf`) that retains only pixels with a confidence value equal to 2.
3. **3D Point Calculation:**
Constructs homogeneous 2D pixel coordinates by stacking x, y, and a constant value of 1.
Multiplies these coordinates by the corresponding depth values to obtain scaled 3D coordinates in pixel space.
Transforms these coordinates into camera space by applying the inverse of the camera intrinsic matrix (`ixt`).
4. **Optional Extrinsic Transformation:**
If an extrinsic matrix (`ext`) is provided, the function converts the 3D points from camera coordinates to another coordinate system (e.g., world coordinates).
This is done by converting the points to homogeneous coordinates, then applying the inverse of the extrinsic matrix.
5. **Clipping (Optional):**
If a clipping box (`clip_box`) is specified, further filters the points based on the provided 3D bounds.
The `clip_box` should be a list or tuple of 6 values:
`[x_max, y_max, z_max, x_min, y_min, z_min]`
Each value can be `None` if no clipping is required for that boundary.
6. **Color Association:**
If a color image (`color`) is provided, the function maps corresponding color values to the valid 3D points.
If the color image has a data type of unsigned 8-bit integers, it is converted to float values in the range [0, 1].
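Steps 1-3 above amount to a standard pinhole back-projection. A minimal standalone sketch of just those steps, using a made-up 2x2 depth map and intrinsic matrix (not values from this codebase):

```python
import numpy as np

# Made-up 2x2 depth map (0.0 marks an invalid pixel) and pinhole intrinsics.
depth = np.array([[1.0, 2.0],
                  [1.5, 0.0]])
ixt = np.array([[100.0, 0.0, 0.5],
                [0.0, 100.0, 0.5],
                [0.0, 0.0, 1.0]])

# Step 1: pixel-coordinate grid matching the depth image.
h, w = depth.shape
xx, yy = np.meshgrid(np.arange(w), np.arange(h))

# Step 2: flatten and mask out invalid (non-positive) depths.
xx, yy, d = xx.reshape(-1), yy.reshape(-1), depth.reshape(-1)
mask = d > 0.0
xx, yy, d = xx[mask], yy[mask], d[mask]

# Step 3: homogeneous pixel coordinates scaled by depth, back-projected
# through the inverse intrinsics: X_cam = K^-1 @ (u*d, v*d, d).
pts = np.stack([xx, yy, np.ones_like(xx)], axis=1) * d[:, None]
pts = pts @ np.linalg.inv(ixt).T
print(pts.shape)  # (3, 3): three valid pixels, each an (x, y, z) point
```

Note that the z coordinate of each output point equals the input depth, since the last row of the intrinsic matrix is (0, 0, 1).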
- If `ret_pcd` is set to **False**, the function returns:
- A tuple `(points, colors)` if `color` is provided, where:
- `points` is a NumPy array of shape (N, 3) containing 3D coordinates.
- `colors` is a NumPy array of shape (N, 3) with the corresponding color values.
- Otherwise, only the NumPy array `points` is returned.
- If `ret_pcd` is **True**, the function returns an Open3D `PointCloud` object, populated with points and, if available, colors.
Args:
depth (np.ndarray):
A 2D array (height x width) representing the depth map. Each element is a depth value in the same unit as the camera parameters.
ixt (np.ndarray):
A 3x3 camera intrinsic matrix used to convert pixel coordinates into normalized camera coordinates.
depth_min (float, optional):
The minimum valid depth value. Any depth value below this threshold is discarded.
depth_max (float, optional):
The maximum valid depth value. Any depth value above this threshold is discarded.
If set to `None`, no upper threshold is applied.
color (np.ndarray, optional):
An optional color image associated with the depth map. Expected shape is (height, width, 3).
If provided, the function returns the corresponding colors for each valid 3D point.
ext (np.ndarray, optional):
An optional 4x4 extrinsic matrix used to transform points from camera coordinates to another coordinate system (e.g., world coordinates).
conf (np.ndarray, optional):
An optional 2D array with the same shape as `depth` that represents a confidence map for the depth values.
Only pixels with a confidence value equal to 2 are retained.
ret_pcd (bool, optional):
If `True`, the function returns an Open3D `PointCloud` object containing the points (and colors, if available).
If `False`, the function returns NumPy arrays. Defaults to `False`.
clip_box (list or tuple, optional):
A list or tuple containing 6 values `[x_max, y_max, z_max, x_min, y_min, z_min]` that define a 3D clipping box.
Points falling outside these bounds are discarded. Each bound can be set to `None` to indicate no clipping for that side.
Returns:
If `color` is provided:
- When `ret_pcd` is **False**:
Tuple[np.ndarray, np.ndarray]:
- `points`: A NumPy array of shape (N, 3) containing the unprojected 3D coordinates.
- `colors`: A NumPy array of shape (N, 3) containing the corresponding color values.
- When `ret_pcd` is **True**:
open3d.geometry.PointCloud:
An Open3D PointCloud object with points and associated colors.
If `color` is not provided:
- When `ret_pcd` is **False**:
A NumPy array of shape (N, 3) containing the unprojected 3D coordinates.
- When `ret_pcd` is **True**:
open3d.geometry.PointCloud:
An Open3D PointCloud object containing the 3D points.
Raises:
AssertionError:
If a `clip_box` is provided but does not contain exactly 6 elements.
Examples:
>>> import numpy as np
>>> # Create a synthetic depth map of size 480x640 with values between 0.5 and 2.0 meters.
>>> depth = np.random.uniform(0.5, 2.0, (480, 640))
>>> # Define a sample camera intrinsic matrix.
>>> ixt = np.array([[600, 0, 320],
...                 [0, 600, 240],
...                 [0, 0, 1]])
>>> # Unproject the depth map into 3D points.
>>> points = unproject_depth(depth, ixt, depth_min=0.5, depth_max=2.0)
>>> points.shape
(N, 3)  # N is the number of points remaining after depth filtering.
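The `clip_box` convention (first three entries are upper bounds on x, y, z; last three are lower bounds; `None` disables a side) can be sketched in isolation. The points and box below are made up for illustration:

```python
import numpy as np

# Made-up points and a clip box [x_max, y_max, z_max, x_min, y_min, z_min].
pts = np.array([[0.0, 0.0, 1.0],
                [3.0, 0.0, 1.0],
                [0.0, 0.0, 5.0]])
clip_box = [2.0, None, 4.0, -1.0, None, 0.0]

keep = np.ones(pts.shape[0], dtype=bool)
for i, val in enumerate(clip_box):
    if val is None:
        continue  # None disables clipping on this side
    if i < 3:
        keep &= pts[:, i] <= val      # upper bound on axis i
    else:
        keep &= pts[:, i - 3] >= val  # lower bound on axis i - 3
print(keep)  # [ True False False]
```

Only the first point survives: the second violates `x_max = 2.0` and the third violates `z_max = 4.0`.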
Notes:
- The function expects the depth values and intrinsic matrix to be in consistent units.
- When an extrinsic matrix is provided, the transformation is applied after computing the 3D camera coordinates.
- The clipping box should be specified in the coordinate space of the output points (camera space, or world space when `ext` is provided).
"""
height, width = depth.shape
x, y = np.arange(width), np.arange(height)
xx, yy = np.meshgrid(x, y)
xx, yy, depth = xx.reshape(-1), yy.reshape(-1), depth.reshape(-1)
mask = np.ones_like(xx, dtype=np.bool_)
# Apply depth thresholds.
if depth_min is not None:
    mask &= depth >= depth_min
if depth_max is not None:
    mask &= depth <= depth_max
# Apply confidence filtering if provided.
if conf is not None:
    mask &= conf.reshape(-1) == 2
xx, yy, depth = xx[mask], yy[mask], depth[mask]
# Construct homogeneous pixel coordinates and scale by depth.
pcd = np.stack([xx, yy, np.ones_like(xx)], axis=1) * depth[:, None]
# Transform pixel coordinates into 3D camera coordinates.
pcd = np.dot(pcd, np.linalg.inv(ixt).T)
# Apply extrinsic transformation if provided.
if ext is not None:
    pcd = np.concatenate([pcd, np.ones((pcd.shape[0], 1))], axis=1)
    pcd = np.dot(pcd, np.linalg.inv(ext).T)
# Apply clipping if a clip_box is provided.
new_mask = np.ones_like(pcd[:, 0]).astype(np.bool_)
if clip_box is not None:
    assert len(clip_box) == 6, "clip_box must be a list or tuple of 6 elements: [x_max, y_max, z_max, x_min, y_min, z_min]"
    for i, val in enumerate(clip_box):
        if val is None:
            continue
        if i < 3:
            # The first three values are upper bounds on x, y, z.
            new_mask &= pcd[:, i] <= val
        else:
            # The last three values are lower bounds on x, y, z.
            new_mask &= pcd[:, i - 3] >= val
# Process color information if provided.
if color is not None:
    if color.dtype == np.uint8:
        color = color.astype(np.float32) / 255.
    if ret_pcd:
        import open3d as o3d  # Local import to avoid a hard dependency when not returning a point cloud.
        pcd_obj = o3d.geometry.PointCloud()
        pcd_obj.points = o3d.utility.Vector3dVector(pcd[:, :3][new_mask])
        pcd_obj.colors = o3d.utility.Vector3dVector(color.reshape(-1, 3)[mask][new_mask])
        return pcd_obj
    return pcd[:, :3][new_mask], color.reshape(-1, 3)[mask][new_mask]
if ret_pcd:
    import open3d as o3d  # Local import to avoid a hard dependency when not returning a point cloud.
    pcd_obj = o3d.geometry.PointCloud()
    pcd_obj.points = o3d.utility.Vector3dVector(pcd[:, :3][new_mask])
    return pcd_obj
return pcd[:, :3][new_mask]
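For reference, the optional extrinsic step can be checked in isolation. Since the function applies `inv(ext)`, `ext` is assumed to map world coordinates to camera coordinates; the pure-translation extrinsic below is made up for illustration:

```python
import numpy as np

# Made-up world-to-camera extrinsic: identity rotation, translation (0, 0, 2).
ext = np.eye(4)
ext[:3, 3] = [0.0, 0.0, 2.0]

# A single camera-space point, 2 units in front of the camera.
pcd = np.array([[0.0, 0.0, 2.0]])

# Homogenize and apply inv(ext), as the function does, to recover the
# world-space coordinates of the point.
pcd_h = np.concatenate([pcd, np.ones((pcd.shape[0], 1))], axis=1)
world = pcd_h @ np.linalg.inv(ext).T
print(world[:, :3])  # [[0. 0. 0.]]
```

Under this convention a point directly in front of a camera placed 2 units behind the world origin maps back to the origin, which matches the world-to-camera reading of `ext`.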