-
Notifications
You must be signed in to change notification settings - Fork 35
Home
Welcome to the FastGS wiki! Here we provide guidance on how to integrate FastGS into other algorithms. Please follow the tutorial patiently. We have already open-sourced our improved implementations of dynamic reconstruction and sparse-view reconstruction. You can also use them as references for your own modifications. If you encounter any issues, you can contact us at [email protected] .
We are currently exploring how to integrate the backward pass from Taming-3DGS into different algorithms. If you’re interested in this backward implementation, we’d be happy to discuss and learn from each other!
Thanks again to 3DGS, Taming-3DGS, Abs-GS and Speedy-Splat for their great contributions.
-
Please add fast_utils.py under the utils folder.
-
Function Import
# train.py
from gaussian_renderer import render_fastgs
from utils.fast_utils import compute_gaussian_score_fastgs, sampling_cameras- Insert opt.mult into every render_fastgs call, placing it immediately after bg:
# train.py
render_pkg = render_fastgs(viewpoint_cam, gaussians, pipe, bg, opt.mult)
training_report(tb_writer, iteration, Ll1, loss, l1_loss, iter_time, testing_iterations, scene, render_fastgs, (pipe, background, opt.mult))- Add VCD and VCP modules:
# train.py, during densification
if iteration > opt.densify_from_iter and iteration % opt.densification_interval == 0:
size_threshold = 20 if iteration > opt.opacity_reset_interval else None
my_viewpoint_stack = scene.getTrainCameras().copy()
camlist = sampling_cameras(my_viewpoint_stack)
# The multiview consistent densification of fastgs
importance_score, pruning_score = compute_gaussian_score_fastgs(camlist, gaussians, pipe, bg, opt, DENSIFY=True)
gaussians.densify_and_prune_fastgs(max_screen_size = size_threshold,
min_opacity = 0.005,
extent = scene.cameras_extent,
radii=radii,
args = opt,
importance_score = importance_score,
pruning_score = pruning_score)# train.py, after densification
if iteration % 3000 == 0 and iteration > 15_000 and iteration < 30_000: # after 15000th iteration
my_viewpoint_stack = scene.getTrainCameras().copy()
camlist = sampling_cameras(my_viewpoint_stack)
_, pruning_score = compute_gaussian_score_fastgs(camlist, gaussians, pipe, bg, opt)
gaussians.final_prune_fastgs(min_opacity = 0.1, pruning_score = pruning_score)# Add the following functions in gaussian_model.py:
def densify_and_split_fastgs(self, metric_mask, filter, N=2):
n_init_points = self.get_xyz.shape[0]
selected_pts_mask = torch.zeros((n_init_points), dtype=bool, device="cuda")
mask = torch.logical_and(metric_mask, filter)
selected_pts_mask[:mask.shape[0]] = mask
stds = self.get_scaling[selected_pts_mask].repeat(N,1)
means =torch.zeros((stds.size(0), 3),device="cuda")
samples = torch.normal(mean=means, std=stds)
rots = build_rotation(self._rotation[selected_pts_mask]).repeat(N,1,1)
new_xyz = torch.bmm(rots, samples.unsqueeze(-1)).squeeze(-1) + self.get_xyz[selected_pts_mask].repeat(N, 1)
new_scaling = self.scaling_inverse_activation(self.get_scaling[selected_pts_mask].repeat(N,1) / (0.8*N))
new_rotation = self._rotation[selected_pts_mask].repeat(N,1)
new_features_dc = self._features_dc[selected_pts_mask].repeat(N,1,1)
new_features_rest = self._features_rest[selected_pts_mask].repeat(N,1,1)
new_opacity = self._opacity[selected_pts_mask].repeat(N,1)
new_tmp_radii = self.tmp_radii[selected_pts_mask].repeat(N)
self.densification_postfix(new_xyz, new_features_dc, new_features_rest, new_opacity, new_scaling, new_rotation, new_tmp_radii)
prune_filter = torch.cat((selected_pts_mask, torch.zeros(N * selected_pts_mask.sum(), device="cuda", dtype=bool)))
self.prune_points(prune_filter)
def densify_and_clone_fastgs(self, metric_mask, filter):
selected_pts_mask = torch.logical_and(metric_mask, filter)
new_xyz = self._xyz[selected_pts_mask]
new_features_dc = self._features_dc[selected_pts_mask]
new_features_rest = self._features_rest[selected_pts_mask]
new_opacities = self._opacity[selected_pts_mask]
new_scaling = self._scaling[selected_pts_mask]
new_rotation = self._rotation[selected_pts_mask]
new_tmp_radii = self.tmp_radii[selected_pts_mask]
self.densification_postfix(new_xyz, new_features_dc, new_features_rest, new_opacities, new_scaling, new_rotation, new_tmp_radii)
def densify_and_prune_fastgs(self, max_screen_size, min_opacity, extent, radii, args, importance_score = None, pruning_score = None):
'''
Densification and Pruning based on FastGS criteria:
1. The gaussians candidate for densification are selected based on the gradient of their position first.
2. Then, based on their average metric score (computed over multiple sampled views), they are either densified (cloned) or split.
This is our main contribution compared to the vanilla 3DGS.
3. Finally, gaussians with low opacity or very large size are pruned.
'''
grad_vars = self.xyz_gradient_accum / self.denom
grad_vars[grad_vars.isnan()] = 0.0
self.tmp_radii = radii
grads_abs = self.xyz_gradient_accum_abs / self.denom
grads_abs[grads_abs.isnan()] = 0.0
grad_qualifiers = torch.where(torch.norm(grad_vars, dim=-1) >= args.grad_thresh, True, False)
grad_qualifiers_abs = torch.where(torch.norm(grads_abs, dim=-1) >= args.grad_abs_thresh, True, False)
clone_qualifiers = torch.max(self.get_scaling, dim=1).values <= args.dense*extent
split_qualifiers = torch.max(self.get_scaling, dim=1).values > args.dense*extent
all_clones = torch.logical_and(clone_qualifiers, grad_qualifiers)
all_splits = torch.logical_and(split_qualifiers, grad_qualifiers_abs)
# This is our multi-view consisent metric for densification
# We use this metric to further filter the candidates for densification, which is similar to taming 3dgs.
metric_mask = importance_score > 5
self.densify_and_clone_fastgs(metric_mask, all_clones)
self.densify_and_split_fastgs(metric_mask, all_splits)
prune_mask = (self.get_opacity < min_opacity).squeeze()
if max_screen_size:
big_points_vs = self.max_radii2D > max_screen_size
big_points_ws = self.get_scaling.max(dim=1).values > 0.1 * extent
prune_mask = torch.logical_or(torch.logical_or(prune_mask, big_points_vs), big_points_ws)
scores = 1 - pruning_score
to_remove = torch.sum(prune_mask)
remove_budget = int(0.5 * to_remove)
# The budget is not necessary for our method.
if remove_budget:
n_init_points = self.get_xyz.shape[0]
padded_importance = torch.zeros((n_init_points), dtype=torch.float32)
padded_importance[:scores.shape[0]] = 1 / (1e-6 + scores.squeeze())
selected_pts_mask = torch.zeros_like(padded_importance, dtype=bool, device="cuda")
sampled_indices = torch.multinomial(padded_importance, remove_budget, replacement=False)
selected_pts_mask[sampled_indices] = True
final_prune = torch.logical_and(prune_mask, selected_pts_mask)
self.prune_points(final_prune)
opacities_new = inverse_sigmoid(torch.min(self.get_opacity, torch.ones_like(self.get_opacity)*0.8))
optimizable_tensors = self.replace_tensor_to_optimizer(opacities_new, "opacity")
self._opacity = optimizable_tensors["opacity"]
tmp_radii = self.tmp_radii
self.tmp_radii = None
torch.cuda.empty_cache()
def final_prune_fastgs(self, min_opacity, pruning_score = None):
"""Final-stage pruning: remove Gaussians based on opacity and multi-view consistency.
In the final stage we remove Gaussians that have low opacity or that are flagged by
our multi-view reconstruction consistency metric (provided as `pruning_score`)."""
prune_mask = (self.get_opacity < min_opacity).squeeze()
scores_mask = pruning_score > 0.9
final_prune = torch.logical_or(prune_mask, scores_mask)
self.prune_points(final_prune)- Modifications to the rendering pipeline:
# Please follow the parameter order of render_fastgs in our gaussian_renderer/__init__.py to add mult, get_flag=None, metric_map=None.
# Place mult after background/bg, and get_flag=None, metric_map=None at the end, for example:
def render_fastgs(viewpoint_camera, pc : GaussianModel, pipe, bg_color : torch.Tensor, mult, scaling_modifier = 1.0, override_color = None, get_flag=None, metric_map = None):# gaussian_renderer/__init__.py, modify screenspace_points to use absolute gradients from Abs-GS
# Adding absolute gradients is straightforward, and I will explain it at the end.
screenspace_points = torch.zeros((pc.get_xyz.shape[0], 4), dtype=pc.get_xyz.dtype, requires_grad=True, device="cuda") + 0# gaussian_renderer/__init__.py , add:
if metric_map==None:
metric_map=torch.zeros(int(viewpoint_camera.image_height)*int(viewpoint_camera.image_width), dtype=torch.int, device='cuda')
raster_settings = GaussianRasterizationSettings(
image_height=int(viewpoint_camera.image_height),
image_width=int(viewpoint_camera.image_width),
tanfovx=tanfovx,
tanfovy=tanfovy,
bg=bg_color,
scale_modifier=scaling_modifier,
viewmatrix=viewpoint_camera.world_view_transform,
projmatrix=viewpoint_camera.full_proj_transform,
sh_degree=pc.active_sh_degree,
campos=viewpoint_camera.camera_center,
mult = mult, # Here, please add
prefiltered=False,
debug=pipe.debug,
get_flag=get_flag, # Here, please add
metric_map = metric_map # Here, please add
)# Note that low-order and high-order SH coefficients are stored separately.
# We keep the SH optimization acceleration from Taming-3DGS and will explain how to modify it at the end.
if override_color is None:
if pipe.convert_SHs_python:
shs_view = pc.get_features.transpose(1, 2).view(-1, 3, (pc.max_sh_degree+1)**2)
dir_pp = (pc.get_xyz - viewpoint_camera.camera_center.repeat(pc.get_features.shape[0], 1))
dir_pp_normalized = dir_pp/dir_pp.norm(dim=1, keepdim=True)
sh2rgb = eval_sh(pc.active_sh_degree, shs_view, dir_pp_normalized)
colors_precomp = torch.clamp_min(sh2rgb + 0.5, 0.0)
else:
dc, shs = pc.get_features_dc, pc.get_features_rest # Here
else:
colors_precomp = override_color
# Add accum_metric_counts as output
rendered_image, radii, accum_metric_counts = rasterizer(
means3D = means3D,
means2D = means2D,
dc = dc, # Here
shs = shs, # Here
colors_precomp = colors_precomp,
opacities = opacity,
scales = scales,
rotations = rotations,
cov3D_precomp = cov3D_precomp)
# Add accum_metric_counts to the return values
return {"render": rendered_image,
"viewspace_points": screenspace_points,
"visibility_filter" : (radii > 0).nonzero(),
"radii": radii,
"accum_metric_counts" : accum_metric_counts # Here, please add
}6.Modify the underlying rasterization rendering. In the following code, we show the parts that need to be added, marked with '# Here, please add'.
# submoudles/diff-gaussian-.../diff-gaussian-.../__init__.py
class GaussianRasterizationSettings(NamedTuple):
image_height: int
image_width: int
tanfovx : float
tanfovy : float
bg : torch.Tensor
scale_modifier : float
viewmatrix : torch.Tensor
projmatrix : torch.Tensor
sh_degree : int
campos : torch.Tensor
mult : float # Here, please add
prefiltered : bool
debug : bool
get_flag : bool # Here, please add
metric_map : torch.Tensor # Here, please add
class _RasterizeGaussians(torch.autograd.Function):
@staticmethod
def forward(
ctx,
means3D,
means2D,
dc,
sh,
colors_precomp,
opacities,
scales,
rotations,
cov3Ds_precomp,
raster_settings
):
# Restructure arguments the way that the C++ lib expects them
get_flag = raster_settings.get_flag # Here, please add
if get_flag == None: # Here, please add
get_flag = False # Here, please add
args = (
raster_settings.bg,
means3D,
colors_precomp,
opacities,
scales,
rotations,
raster_settings.scale_modifier,
cov3Ds_precomp,
raster_settings.metric_map, # Here, please add
raster_settings.viewmatrix,
raster_settings.projmatrix,
raster_settings.tanfovx,
raster_settings.tanfovy,
raster_settings.image_height,
raster_settings.image_width,
dc,
sh,
raster_settings.sh_degree,
raster_settings.campos,
raster_settings.mult, # Here, please add
raster_settings.prefiltered,
raster_settings.debug,
get_flag # Here, please add
)
# Invoke C++/CUDA rasterizer
if raster_settings.debug:
cpu_args = cpu_deep_copy_tuple(args) # Copy them before they can be corrupted
try:
num_rendered, num_buckets, color, radii, geomBuffer, binningBuffer, imgBuffer = _C.rasterize_gaussians(*args)
except Exception as ex:
torch.save(cpu_args, "snapshot_fw.dump")
print("\nAn error occured in forward. Please forward snapshot_fw.dump for debugging.")
raise ex
else:
# Here, please add accum_metric_counts as output
num_rendered, num_buckets, color, radii, geomBuffer, binningBuffer, imgBuffer, sampleBuffer, accum_metric_counts = _C.rasterize_gaussians(*args)
# Keep relevant tensors for backward
ctx.raster_settings = raster_settings
ctx.num_rendered = num_rendered
ctx.num_buckets = num_buckets
ctx.save_for_backward(colors_precomp, means3D, scales, rotations, cov3Ds_precomp, radii, dc, sh, geomBuffer, binningBuffer, imgBuffer, sampleBuffer)
return color, radii, accum_metric_counts # Here, please add accum_metric_counts
@staticmethod
def backward(ctx, grad_out_color, _, g_metric): Here, please add g_metric# rasterize_points.h
std::tuple<int, int, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
RasterizeGaussiansCUDA( # Here, please add ', torch::Tensor, torch::Tensor, torch::Tensor'
const torch::Tensor& background,
const torch::Tensor& means3D,
const torch::Tensor& colors,
const torch::Tensor& opacity,
const torch::Tensor& scales,
const torch::Tensor& rotations,
const float scale_modifier,
const torch::Tensor& cov3D_precomp,
const torch::Tensor& metric_map, # Here, please add
const torch::Tensor& viewmatrix,
const torch::Tensor& projmatrix,
const float tan_fovx,
const float tan_fovy,
const int image_height,
const int image_width,
const torch::Tensor& dc,
const torch::Tensor& sh,
const int degree,
const torch::Tensor& campos,
const float mult, # Here, please add
const bool prefiltered,
const bool debug,
const bool get_flag # Here, please add
);
# rasterize_points.cu
std::tuple<int, int, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor>
RasterizeGaussiansCUDA( # Here, please add ', torch::Tensor, torch::Tensor, torch::Tensor'
const torch::Tensor& background,
const torch::Tensor& means3D,
const torch::Tensor& colors,
const torch::Tensor& opacity,
const torch::Tensor& scales,
const torch::Tensor& rotations,
const float scale_modifier,
const torch::Tensor& cov3D_precomp,
const torch::Tensor& metric_map, # Here, please add
const torch::Tensor& viewmatrix,
const torch::Tensor& projmatrix,
const float tan_fovx,
const float tan_fovy,
const int image_height,
const int image_width,
const torch::Tensor& dc,
const torch::Tensor& sh,
const int degree,
const torch::Tensor& campos,
const float mult, # Here, please add
const bool prefiltered,
const bool debug,
const bool get_flag # Here, please add
)
{
if (means3D.ndimension() != 2 || means3D.size(1) != 3) {
AT_ERROR("means3D must have dimensions (num_points, 3)");
}
const int P = means3D.size(0);
const int H = image_height;
const int W = image_width;
auto int_opts = means3D.options().dtype(torch::kInt32);
auto float_opts = means3D.options().dtype(torch::kFloat32);
torch::Tensor out_color = torch::full({NUM_CHAFFELS, H, W}, 0.0, float_opts);
torch::Tensor radii = torch::full({P}, 0, means3D.options().dtype(torch::kInt32));
torch::Device device(torch::kCUDA);
torch::TensorOptions options(torch::kByte);
torch::Tensor geomBuffer = torch::empty({0}, options.device(device));
torch::Tensor binningBuffer = torch::empty({0}, options.device(device));
torch::Tensor imgBuffer = torch::empty({0}, options.device(device));
torch::Tensor sampleBuffer = torch::empty({0}, options.device(device));
std::function<char*(size_t)> geomFunc = resizeFunctional(geomBuffer);
std::function<char*(size_t)> binningFunc = resizeFunctional(binningBuffer);
std::function<char*(size_t)> imgFunc = resizeFunctional(imgBuffer);
std::function<char*(size_t)> sampleFunc = resizeFunctional(sampleBuffer);
int* accum_metric_counts_ptr = nullptr; # Here, please add
torch::Tensor metricCount = torch::empty({0}, int_opts); # Here, please add
if(get_flag) # Here, please add
{ # Here, please add
metricCount = torch::full({P}, 0, int_opts); # Here, please add
accum_metric_counts_ptr = metricCount.contiguous().data<int>(); # Here, please add
} # Here, please add
int rendered = 0;
int num_buckets = 0;
if(P != 0)
{
int M = 0;
if(sh.size(0) != 0)
{
M = sh.size(1);
}
auto tup = CudaRasterizer::Rasterizer::forward(
geomFunc,
binningFunc,
imgFunc,
sampleFunc,
P, degree, M,
background.contiguous().data<float>(),
W, H,
means3D.contiguous().data<float>(),
dc.contiguous().data_ptr<float>(),
sh.contiguous().data_ptr<float>(),
colors.contiguous().data<float>(),
opacity.contiguous().data<float>(),
scales.contiguous().data_ptr<float>(),
scale_modifier,
rotations.contiguous().data_ptr<float>(),
cov3D_precomp.contiguous().data<float>(),
metric_map.contiguous().data<int>(), # Here, please add
viewmatrix.contiguous().data<float>(),
projmatrix.contiguous().data<float>(),
campos.contiguous().data<float>(),
mult,
tan_fovx,
tan_fovy,
prefiltered,
out_color.contiguous().data<float>(),
radii.contiguous().data<int>(),
debug,
get_flag, # Here, please add
accum_metric_counts_ptr); # Here, please add
rendered = std::get<0>(tup);
num_buckets = std::get<1>(tup);
}
return std::make_tuple(rendered, num_buckets, out_color, radii, geomBuffer, binningBuffer, imgBuffer, sampleBuffer, metricCount); # Here, please add metricCount
}# cuda_rasterizer/rasterizer.h
static std::tuple<int,int> forward(
std::function<char* (size_t)> geometryBuffer,
std::function<char* (size_t)> binningBuffer,
std::function<char* (size_t)> imageBuffer,
std::function<char* (size_t)> sampleBuffer,
const int P, int D, int M,
const float* background,
const int width, int height,
const float* means3D,
const float* dc,
const float* shs,
const float* colors_precomp,
const float* opacities,
const float* scales,
const float scale_modifier,
const float* rotations,
const float* cov3D_precomp,
const int* metric_map, # Here, please add
const float* viewmatrix,
const float* projmatrix,
const float* cam_pos,
const float mult, # Here, please add
const float tan_fovx, float tan_fovy,
const bool prefiltered,
float* out_color,
int* radii = nullptr,
bool debug = false,
bool get_flag = false, # Here, please add
int* metricCount = nullptr); # Here, please add
# cuda_rasterizer/rasterizer_impl.cu
# Replace the original `duplicateWithKeys` function with the following `duplicateWithKeys`.
__global__ void duplicateWithKeys(
int P,
const float mult,
const float2* points_xy,
const float* depths,
const uint32_t* offsets,
uint64_t* gaussian_keys_unsorted,
uint32_t* gaussian_values_unsorted,
float4* con_o,
uint32_t* tiles_touched,
dim3 grid)
{
auto idx = cg::this_grid().thread_rank();
if (idx >= P)
return;
// Generate no key/value pair for invisible Gaussians
if (tiles_touched[idx] > 0)
{
// Find this Gaussian's offset in buffer for writing keys/values.
uint32_t off = (idx == 0) ? 0 : offsets[idx - 1];
// Update unsorted arrays with Gaussian idx for every tile that
// Gaussian touches
duplicateToTilesTouched(
points_xy[idx], con_o[idx], grid, mult,
idx, off, depths[idx],
gaussian_keys_unsorted,
gaussian_values_unsorted);
}
}
std::tuple<int,int> CudaRasterizer::Rasterizer::forward(
std::function<char* (size_t)> geometryBuffer,
std::function<char* (size_t)> binningBuffer,
std::function<char* (size_t)> imageBuffer,
std::function<char* (size_t)> sampleBuffer,
const int P, int D, int M,
const float* background,
const int width, int height,
const float* means3D,
const float* dc,
const float* shs,
const float* colors_precomp,
const float* opacities,
const float* scales,
const float scale_modifier,
const float* rotations,
const float* cov3D_precomp,
const int* metric_map, # Here, please add
const float* viewmatrix,
const float* projmatrix,
const float* cam_pos,
const float mult, # Here, please add
const float tan_fovx, float tan_fovy,
const bool prefiltered,
float* out_color,
int* radii,
bool debug,
bool get_flag, # Here, please add
int* metricCount) # Here, please add
{
const float focal_y = height / (2.0f * tan_fovy);
const float focal_x = width / (2.0f * tan_fovx);
size_t chunk_size = required<GeometryState>(P);
char* chunkptr = geometryBuffer(chunk_size);
GeometryState geomState = GeometryState::fromChunk(chunkptr, P);
if (radii == nullptr)
{
radii = geomState.internal_radii;
}
dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, 1);
dim3 block(BLOCK_X, BLOCK_Y, 1);
// Dynamically resize image-based auxiliary buffers during training
size_t img_chunk_size = required<ImageState>(width * height);
char* img_chunkptr = imageBuffer(img_chunk_size);
ImageState imgState = ImageState::fromChunk(img_chunkptr, width * height);
if (NUM_CHAFFELS != 3 && colors_precomp == nullptr)
{
throw std::runtime_error("For non-RGB, provide precomputed Gaussian colors!");
}
// Run preprocessing per-Gaussian (transformation, bounding, conversion of SHs to RGB)
CHECK_CUDA(FORWARD::preprocess(
P, D, M,
means3D,
(glm::vec3*)scales,
scale_modifier,
(glm::vec4*)rotations,
opacities,
dc,
shs,
geomState.clamped,
cov3D_precomp,
colors_precomp,
viewmatrix, projmatrix,
(glm::vec3*)cam_pos,
mult, # Here, please add
width, height,
focal_x, focal_y,
tan_fovx, tan_fovy,
radii,
geomState.means2D,
geomState.depths,
geomState.cov3D,
geomState.rgb,
geomState.conic_opacity,
tile_grid,
geomState.tiles_touched,
prefiltered
), debug)
// Compute prefix sum over full list of touched tile counts by Gaussians
// E.g., [2, 3, 0, 2, 1] -> [2, 5, 5, 7, 8]
CHECK_CUDA(cub::DeviceScan::InclusiveSum(geomState.scanning_space, geomState.scan_size, geomState.tiles_touched, geomState.point_offsets, P), debug)
// Retrieve total number of Gaussian instances to launch and resize aux buffers
int num_rendered;
CHECK_CUDA(cudaMemcpy(&num_rendered, geomState.point_offsets + P - 1, sizeof(int), cudaMemcpyDeviceToHost), debug);
size_t binning_chunk_size = required<BinningState>(num_rendered);
char* binning_chunkptr = binningBuffer(binning_chunk_size);
BinningState binningState = BinningState::fromChunk(binning_chunkptr, num_rendered);
// For each instance to be rendered, produce adequate [ tile | depth ] key
// and corresponding dublicated Gaussian indices to be sorted
duplicateWithKeys << <(P + 255) / 256, 256 >> > ( # Here, please add (Directly replace)
P, mult,
geomState.means2D,
geomState.depths,
geomState.point_offsets,
binningState.point_list_keys_unsorted,
binningState.point_list_unsorted,
geomState.conic_opacity,
geomState.tiles_touched,
tile_grid)
CHECK_CUDA(, debug)
int bit = getHigherMsb(tile_grid.x * tile_grid.y);
// Sort complete list of (duplicated) Gaussian indices by keys
CHECK_CUDA(cub::DeviceRadixSort::SortPairs(
binningState.list_sorting_space,
binningState.sorting_size,
binningState.point_list_keys_unsorted, binningState.point_list_keys,
binningState.point_list_unsorted, binningState.point_list,
num_rendered, 0, 32 + bit), debug)
CHECK_CUDA(cudaMemset(imgState.ranges, 0, tile_grid.x * tile_grid.y * sizeof(uint2)), debug);
// Identify start and end of per-tile workloads in sorted list
if (num_rendered > 0)
identifyTileRanges << <(num_rendered + 255) / 256, 256 >> > (
num_rendered,
binningState.point_list_keys,
imgState.ranges);
CHECK_CUDA(, debug)
// bucket count
int num_tiles = tile_grid.x * tile_grid.y;
perTileBucketCount<<<(num_tiles + 255) / 256, 256>>>(num_tiles, imgState.ranges, imgState.bucket_count);
CHECK_CUDA(cub::DeviceScan::InclusiveSum(imgState.bucket_count_scanning_space, imgState.bucket_count_scan_size, imgState.bucket_count, imgState.bucket_offsets, num_tiles), debug)
unsigned int bucket_sum;
CHECK_CUDA(cudaMemcpy(&bucket_sum, imgState.bucket_offsets + num_tiles - 1, sizeof(unsigned int), cudaMemcpyDeviceToHost), debug);
// create a state to store. size is number is the total number of buckets * block_size
size_t sample_chunk_size = required<SampleState>(bucket_sum);
char* sample_chunkptr = sampleBuffer(sample_chunk_size);
SampleState sampleState = SampleState::fromChunk(sample_chunkptr, bucket_sum);
// Let each tile blend its range of Gaussians independently in parallel
const float* feature_ptr = colors_precomp != nullptr ? colors_precomp : geomState.rgb;
CHECK_CUDA(FORWARD::render(
tile_grid, block,
imgState.ranges,
binningState.point_list,
imgState.bucket_offsets, sampleState.bucket_to_tile,
sampleState.T, sampleState.ar,
width, height,
geomState.means2D,
feature_ptr,
geomState.conic_opacity,
imgState.accum_alpha,
imgState.n_contrib,
imgState.max_contrib,
imgState.pixel_colors,
background,
out_color,
imgState.contrib_scan,
imgState.scan_size,
radii,
metric_map, # Here, please add
get_flag, # Here, please add
metricCount # Here, please add
), debug)
return std::make_tuple(num_rendered, bucket_sum);
}# cuda_rasterizer/forward.h
void preprocess(
int P, int D, int M,
const float* orig_points,
const glm::vec3* scales,
const float scale_modifier,
const glm::vec4* rotations,
const float* opacities,
const float* dc,
const float* shs,
bool* clamped,
const float* cov3D_precomp,
const float* colors_precomp,
const float* viewmatrix,
const float* projmatrix,
const glm::vec3* cam_pos,
const float mult, # Here, please add
const int W, int H,
const float focal_x, float focal_y,
const float tan_fovx, float tan_fovy,
int* radii,
float2* points_xy_image,
float* depths,
float* cov3Ds,
float* colors,
float4* conic_opacity,
const dim3 grid,
uint32_t* tiles_touched,
bool prefiltered);
// Main rasterization method.
void render(
const dim3 grid, dim3 block,
const uint2* ranges,
const uint32_t* point_list,
const uint32_t* per_tile_bucket_offset, uint32_t* bucket_to_tile,
float* sampled_T, float* sampled_ar,
int W, int H,
const float2* points_xy_image,
const float* features,
const float4* conic_opacity,
float* final_T,
uint32_t* n_contrib,
uint32_t* max_contrib,
float* pixel_colors,
const float* bg_color,
float* out_color,
char* img_contrib_scan,
size_t scan_size,
int* radii,
const int* metric_map, # Here, please add
bool get_flag, # Here, please add
int* metricCount); # Here, please add
# cuda_rasterizer/forward.cu
// Perform initial steps for each Gaussian prior to rasterization.
template<int C>
__global__ void preprocessCUDA(
int P, int D, int M,
const float* orig_points,
const glm::vec3* scales,
const float scale_modifier,
const glm::vec4* rotations,
const float* opacities,
const float* dc,
const float* shs,
bool* clamped,
const float* cov3D_precomp,
const float* colors_precomp,
const float* viewmatrix,
const float* projmatrix,
const glm::vec3* cam_pos,
const float mult, # Here, please add
const int W, int H,
const float tan_fovx, float tan_fovy,
const float focal_x, float focal_y,
int* radii,
float2* points_xy_image,
float* depths,
float* cov3Ds,
float* rgb,
float4* conic_opacity,
const dim3 grid,
uint32_t* tiles_touched,
bool prefiltered)
{
auto idx = cg::this_grid().thread_rank();
if (idx >= P)
return;
// Initialize radius and touched tiles to 0. If this isn't changed,
// this Gaussian will not be processed further.
radii[idx] = 0;
tiles_touched[idx] = 0;
// Perform near culling, quit if outside.
float3 p_view;
if (!in_frustum(idx, orig_points, viewmatrix, projmatrix, prefiltered, p_view))
return;
// Transform point by projecting
float3 p_orig = { orig_points[3 * idx], orig_points[3 * idx + 1], orig_points[3 * idx + 2] };
float4 p_hom = transformPoint4x4(p_orig, projmatrix);
float p_w = 1.0f / (p_hom.w + 0.0000001f);
float3 p_proj = { p_hom.x * p_w, p_hom.y * p_w, p_hom.z * p_w };
// If 3D covariance matrix is precomputed, use it, otherwise compute
// from scaling and rotation parameters.
const float* cov3D;
if (cov3D_precomp != nullptr)
{
cov3D = cov3D_precomp + idx * 6;
}
else
{
computeCov3D(scales[idx], scale_modifier, rotations[idx], cov3Ds + idx * 6);
cov3D = cov3Ds + idx * 6;
}
// Compute 2D screen-space covariance matrix
float3 cov = computeCov2D(p_orig, focal_x, focal_y, tan_fovx, tan_fovy, cov3D, viewmatrix);
// Invert covariance (EWA algorithm)
float det = (cov.x * cov.z - cov.y * cov.y);
if (det == 0.0f)
return;
float det_inv = 1.f / det;
float3 conic = { cov.z * det_inv, -cov.y * det_inv, cov.x * det_inv };
// Compute extent in screen space (by finding eigenvalues of
// 2D covariance matrix). Use extent to compute a bounding rectangle
// of screen-space tiles that this Gaussian overlaps with. Quit if
// rectangle covers 0 tiles.
float mid = 0.5f * (cov.x + cov.z);
float lambda1 = mid + sqrt(max(0.1f, mid * mid - det));
float lambda2 = mid - sqrt(max(0.1f, mid * mid - det));
float my_radius = ceil(3.f * sqrt(max(lambda1, lambda2)));
float2 point_image = { ndc2Pix(p_proj.x, W), ndc2Pix(p_proj.y, H) };
float4 con_o = { conic.x, conic.y, conic.z, opacities[idx] }; # Here, please add
// Only counts tiles touched when nullptr is passed as array argment.
uint32_t tiles_count = duplicateToTilesTouched(point_image, con_o, grid, mult, 0, 0, 0, nullptr, nullptr); # Here, please add
if (tiles_count == 0) # Here, please add
return; # Here, please add
// If colors have been precomputed, use them, otherwise convert
// spherical harmonics coefficients to RGB color.
if (colors_precomp == nullptr)
{
glm::vec3 result = computeColorFromSH(idx, D, M, (glm::vec3*)orig_points, *cam_pos, dc, shs, clamped);
rgb[idx * C + 0] = result.x;
rgb[idx * C + 1] = result.y;
rgb[idx * C + 2] = result.z;
}
// Store some useful helper data for the next steps.
depths[idx] = p_view.z;
radii[idx] = my_radius;
points_xy_image[idx] = point_image;
// Inverse 2D covariance and opacity neatly pack into one float4
conic_opacity[idx] = con_o; # Here, please add
tiles_touched[idx] = tiles_count; # Here, please add
}
// Main rasterization method. Collaboratively works on one tile per
// block, each thread treats one pixel. Alternates between fetching
// and rasterizing data.
template <uint32_t CHANNELS>
__global__ void __launch_bounds__(BLOCK_X * BLOCK_Y)
renderCUDA(
const uint2* __restrict__ ranges,
const uint32_t* __restrict__ point_list,
const uint32_t* __restrict__ per_tile_bucket_offset, uint32_t* __restrict__ bucket_to_tile,
float* __restrict__ sampled_T, float* __restrict__ sampled_ar,
int W, int H,
const float2* __restrict__ points_xy_image,
const float* __restrict__ features,
const float4* __restrict__ conic_opacity,
float* __restrict__ final_T,
uint32_t* __restrict__ n_contrib,
uint32_t* __restrict__ max_contrib,
float* __restrict__ pixel_colors,
const float* __restrict__ bg_color,
float* __restrict__ out_color,
int* __restrict__ radii,
const int* __restrict__ metric_map, # Here, please add
bool get_flag, # Here, please add
int* __restrict__ metricCount) # Here, please add
{
// Identify current tile and associated min/max pixel range.
auto block = cg::this_thread_block();
uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X;
uint2 pix_min = { block.group_index().x * BLOCK_X, block.group_index().y * BLOCK_Y };
uint2 pix_max = { min(pix_min.x + BLOCK_X, W), min(pix_min.y + BLOCK_Y , H) };
uint2 pix = { pix_min.x + block.thread_index().x, pix_min.y + block.thread_index().y };
uint32_t pix_id = W * pix.y + pix.x;
float2 pixf = { (float)pix.x, (float)pix.y };
// Check if this thread is associated with a valid pixel or outside.
bool inside = pix.x < W&& pix.y < H;
// Done threads can help with fetching, but don't rasterize
bool done = !inside;
// Load start/end range of IDs to process in bit sorted list.
uint32_t tile_id = block.group_index().y * horizontal_blocks + block.group_index().x;
uint2 range = ranges[tile_id];
const int rounds = ((range.y - range.x + BLOCK_SIZE - 1) / BLOCK_SIZE);
int toDo = range.y - range.x;
// what is the number of buckets before me? what is my offset?
uint32_t bbm = tile_id == 0 ? 0 : per_tile_bucket_offset[tile_id - 1];
// let's first quickly also write the bucket-to-tile mapping
int num_buckets = (toDo + 31) / 32;
for (int i = 0; i < (num_buckets + BLOCK_SIZE - 1) / BLOCK_SIZE; ++i) {
int bucket_idx = i * BLOCK_SIZE + block.thread_rank();
if (bucket_idx < num_buckets) {
bucket_to_tile[bbm + bucket_idx] = tile_id;
}
}
// Allocate storage for batches of collectively fetched data.
__shared__ int collected_id[BLOCK_SIZE];
__shared__ float2 collected_xy[BLOCK_SIZE];
__shared__ float4 collected_conic_opacity[BLOCK_SIZE];
// __shared__ float collected_radius2[BLOCK_SIZE];
// Initialize helper variables
float T = 1.0f;
uint32_t contributor = 0;
uint32_t last_contributor = 0;
float C[CHANNELS] = { 0 };
int contribs = 0;
// Iterate over batches until all done or range is complete
for (int i = 0; i < rounds; i++, toDo -= BLOCK_SIZE)
{
// End if entire block votes that it is done rasterizing
int num_done = __syncthreads_count(done);
if (num_done == BLOCK_SIZE)
break;
// Collectively fetch per-Gaussian data from global to shared
int progress = i * BLOCK_SIZE + block.thread_rank();
if (range.x + progress < range.y)
{
int coll_id = point_list[range.x + progress];
collected_id[block.thread_rank()] = coll_id;
collected_xy[block.thread_rank()] = points_xy_image[coll_id];
collected_conic_opacity[block.thread_rank()] = conic_opacity[coll_id];
// collected_radius2[block.thread_rank()] = radii[coll_id] * radii[coll_id];
}
block.sync();
// Iterate over current batch
for (int j = 0; !done && j < min(BLOCK_SIZE, toDo); j++)
{
// add incoming T value for every 32nd gaussian
if (j % 32 == 0) {
sampled_T[(bbm * BLOCK_SIZE) + block.thread_rank()] = T;
for (int ch = 0; ch < CHANNELS; ++ch) {
sampled_ar[(bbm * BLOCK_SIZE * CHANNELS) + ch * BLOCK_SIZE + block.thread_rank()] = C[ch];
}
++bbm;
}
// Keep track of current position in range
contributor++;
// Resample using conic matrix (cf. "Surface
// Splatting" by Zwicker et al., 2001)
float2 xy = collected_xy[j];
float2 d = { xy.x - pixf.x, xy.y - pixf.y };
float4 con_o = collected_conic_opacity[j];
float power = -0.5f * (con_o.x * d.x * d.x + con_o.z * d.y * d.y) - con_o.y * d.x * d.y;
if (power > 0.0f)
continue;
// Eq. (2) from 3D Gaussian splatting paper.
// Obtain alpha by multiplying with Gaussian opacity
// and its exponential falloff from mean.
// Avoid numerical instabilities (see paper appendix).
float alpha = min(0.99f, con_o.w * exp(power));
if (alpha < 1.0f / 255.0f)
continue;
float test_T = T * (1 - alpha);
if (test_T < 0.0001f)
{
done = true;
continue;
}
// Eq. (3) from 3D Gaussian splatting paper.
for (int ch = 0; ch < CHANNELS; ch++)
C[ch] += features[collected_id[j] * CHANNELS + ch] * alpha * T;
if(get_flag) # Here, please add
{
if(metric_map[pix_id] == 1) # Here, please add
{
atomicAdd(&(metricCount[collected_id[j]]), 1); # Here, please add
}
}
T = test_T;
// Keep track of last range entry to update this
// pixel.
last_contributor = contributor;
contribs++;
}
}
// All threads that treat valid pixel write out their final
// rendering data to the frame and auxiliary buffers.
if (inside)
{
final_T[pix_id] = T;
n_contrib[pix_id] = last_contributor;
for (int ch = 0; ch < CHANNELS; ch++)
{
pixel_colors[ch * H * W + pix_id] = C[ch];
out_color[ch * H * W + pix_id] = C[ch] + T * bg_color[ch];
}
}
// max reduce the last contributor
typedef cub::BlockReduce<uint32_t, BLOCK_X, cub::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_Y> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
last_contributor = BlockReduce(temp_storage).Reduce(last_contributor, cub::Max());
if (block.thread_rank() == 0) {
max_contrib[tile_id] = last_contributor;
}
}
void FORWARD::render(
const dim3 grid, dim3 block,
const uint2* ranges,
const uint32_t* point_list,
const uint32_t* per_tile_bucket_offset, uint32_t* bucket_to_tile,
float* sampled_T, float* sampled_ar,
int W, int H,
const float2* means2D,
const float* colors,
const float4* conic_opacity,
float* final_T,
uint32_t* n_contrib,
uint32_t* max_contrib,
float* pixel_colors,
const float* bg_color,
float* out_color,
char* img_contrib_scan,
size_t scan_size,
int* radii,
const int* metric_map, # Here, please add
bool get_flag, # Here, please add
int* metricCount) # Here, please add
{
renderCUDA<NUM_CHAFFELS> << <grid, block >> > (
ranges,
point_list,
per_tile_bucket_offset, bucket_to_tile,
sampled_T, sampled_ar,
W, H,
means2D,
colors,
conic_opacity,
final_T,
n_contrib,
max_contrib,
pixel_colors,
bg_color,
out_color,
radii,
metric_map, # Here, please add
get_flag, # Here, please add
metricCount # Here, please add
);
}
void FORWARD::preprocess(
int P, int D, int M,
const float* means3D,
const glm::vec3* scales,
const float scale_modifier,
const glm::vec4* rotations,
const float* opacities,
const float* dc,
const float* shs,
bool* clamped,
const float* cov3D_precomp,
const float* colors_precomp,
const float* viewmatrix,
const float* projmatrix,
const glm::vec3* cam_pos,
const float mult, # Here, please add
const int W, int H,
const float focal_x, float focal_y,
const float tan_fovx, float tan_fovy,
int* radii,
float2* means2D,
float* depths,
float* cov3Ds,
float* rgb,
float4* conic_opacity,
const dim3 grid,
uint32_t* tiles_touched,
bool prefiltered)
{
preprocessCUDA<NUM_CHAFFELS> << <(P + 255) / 256, 256 >> > (
P, D, M,
means3D,
scales,
scale_modifier,
rotations,
opacities,
dc,
shs,
clamped,
cov3D_precomp,
colors_precomp,
viewmatrix,
projmatrix,
cam_pos,
mult, # Here, please add
W, H,
tan_fovx, tan_fovy,
focal_x, focal_y,
radii,
means2D,
depths,
cov3Ds,
rgb,
conic_opacity,
grid,
tiles_touched,
prefiltered);
}# submoudles/diff-gaussian-.../cuda_rasterizer/auxiliary.h
# Here, Please directly add all the code below.
#include <stdint.h>
__device__ inline float2 computeEllipseIntersection(
const float4 con_o, const float disc, const float t, const float2 p,
const bool isY, const float coord)
{
float p_u = isY ? p.y : p.x;
float p_v = isY ? p.x : p.y;
float coeff = isY ? con_o.x : con_o.z;
float h = coord - p_u; // h = y - p.y for y, x - p.x for x
float sqrt_term = sqrt(disc * h * h + t * coeff);
return {
(-con_o.y * h - sqrt_term) / coeff + p_v,
(-con_o.y * h + sqrt_term) / coeff + p_v
};
}
__device__ inline uint32_t processTiles(
const float4 con_o, const float disc, const float t, const float2 p,
float2 bbox_min, float2 bbox_max,
float2 bbox_argmin, float2 bbox_argmax,
int2 rect_min, int2 rect_max,
const dim3 grid, const bool isY,
uint32_t idx, uint32_t off, float depth,
uint64_t* gaussian_keys_unsorted,
uint32_t* gaussian_values_unsorted
)
{
// ---- AccuTile Code ---- //
// Set variables based on the isY flag
float BLOCK_U = isY ? BLOCK_Y : BLOCK_X;
float BLOCK_V = isY ? BLOCK_X : BLOCK_Y;
if (isY) {
rect_min = {rect_min.y, rect_min.x};
rect_max = {rect_max.y, rect_max.x};
bbox_min = {bbox_min.y, bbox_min.x};
bbox_max = {bbox_max.y, bbox_max.x};
bbox_argmin = {bbox_argmin.y, bbox_argmin.x};
bbox_argmax = {bbox_argmax.y, bbox_argmax.x};
}
uint32_t tiles_count = 0;
float2 intersect_min_line, intersect_max_line;
float ellipse_min, ellipse_max;
float min_line, max_line;
// Initialize max line
// Just need the min to be >= all points on the ellipse
// and max to be <= all points on the ellipse
intersect_max_line = {bbox_max.y, bbox_min.y};
min_line = rect_min.x * BLOCK_U;
// Initialize min line intersections.
if (bbox_min.x <= min_line) {
// Boundary case
intersect_min_line = computeEllipseIntersection(
con_o, disc, t, p, isY, rect_min.x * BLOCK_U);
} else {
// Same as max line
intersect_min_line = intersect_max_line;
}
// Loop over either y slices or x slices based on the `isY` flag.
for (int u = rect_min.x; u < rect_max.x; ++u)
{
// Starting from the bottom or left, we will only need to compute
// intersections at the next line.
max_line = min_line + BLOCK_U;
if (max_line <= bbox_max.x) {
intersect_max_line = computeEllipseIntersection(
con_o, disc, t, p, isY, max_line);
}
// If the bbox min is in this slice, then it is the minimum
// ellipse point in this slice. Otherwise, the minimum ellipse
// point will be the minimum of the intersections of the min/max lines.
if (min_line <= bbox_argmin.y && bbox_argmin.y < max_line) {
ellipse_min = bbox_min.y;
} else {
ellipse_min = min(intersect_min_line.x, intersect_max_line.x);
}
// If the bbox max is in this slice, then it is the maximum
// ellipse point in this slice. Otherwise, the maximum ellipse
// point will be the maximum of the intersections of the min/max lines.
if (min_line <= bbox_argmax.y && bbox_argmax.y < max_line) {
ellipse_max = bbox_max.y;
} else {
ellipse_max = max(intersect_min_line.y, intersect_max_line.y);
}
// Convert ellipse_min/ellipse_max to tiles touched
// First map back to tile coordinates, then subtract.
int min_tile_v = max(rect_min.y,
min(rect_max.y, (int)(ellipse_min / BLOCK_V))
);
int max_tile_v = min(rect_max.y,
max(rect_min.y, (int)(ellipse_max / BLOCK_V + 1))
);
tiles_count += max_tile_v - min_tile_v;
// Only update keys array if it exists.
if (gaussian_keys_unsorted != nullptr) {
// Loop over tiles and add to keys array
for (int v = min_tile_v; v < max_tile_v; v++)
{
// For each tile that the Gaussian overlaps, emit a
// key/value pair. The key is | tile ID | depth |,
// and the value is the ID of the Gaussian. Sorting the values
// with this key yields Gaussian IDs in a list, such that they
// are first sorted by tile and then by depth.
uint64_t key = isY ? (u * grid.x + v) : (v * grid.x + u);
key <<= 32;
key |= *((uint32_t*)&depth);
gaussian_keys_unsorted[off] = key;
gaussian_values_unsorted[off] = idx;
off++;
}
}
// Max line of this tile slice will be min lin of next tile slice
intersect_min_line = intersect_max_line;
min_line = max_line;
}
return tiles_count;
}
__device__ inline uint32_t duplicateToTilesTouched(
const float2 p, const float4 con_o, const dim3 grid, const float mult,
uint32_t idx, uint32_t off, float depth,
uint64_t* gaussian_keys_unsorted,
uint32_t* gaussian_values_unsorted
)
{
// ---- SNUGBOX Code ---- //
// Calculate discriminant
float disc = con_o.y * con_o.y - con_o.x * con_o.z;
// If ill-formed ellipse, return 0
if (con_o.x <= 0 || con_o.z <= 0 || disc >= 0) {
return 0;
}
// Threshold: opacity * Gaussian = 1 / 255
float t = 2.0f * log(con_o.w * 255.0f);
t = mult * t;
float x_term = sqrt(-(con_o.y * con_o.y * t) / (disc * con_o.x));
x_term = (con_o.y < 0) ? x_term : -x_term;
float y_term = sqrt(-(con_o.y * con_o.y * t) / (disc * con_o.z));
y_term = (con_o.y < 0) ? y_term : -y_term;
float2 bbox_argmin = { p.y - y_term, p.x - x_term };
float2 bbox_argmax = { p.y + y_term, p.x + x_term };
float2 bbox_min = {
computeEllipseIntersection(con_o, disc, t, p, true, bbox_argmin.x).x,
computeEllipseIntersection(con_o, disc, t, p, false, bbox_argmin.y).x
};
float2 bbox_max = {
computeEllipseIntersection(con_o, disc, t, p, true, bbox_argmax.x).y,
computeEllipseIntersection(con_o, disc, t, p, false, bbox_argmax.y).y
};
// Rectangular tile extent of ellipse
int2 rect_min = {
max(0, min((int)grid.x, (int)(bbox_min.x / BLOCK_X))),
max(0, min((int)grid.y, (int)(bbox_min.y / BLOCK_Y)))
};
int2 rect_max = {
max(0, min((int)grid.x, (int)(bbox_max.x / BLOCK_X + 1))),
max(0, min((int)grid.y, (int)(bbox_max.y / BLOCK_Y + 1)))
};
int y_span = rect_max.y - rect_min.y;
int x_span = rect_max.x - rect_min.x;
// If no tiles are touched, return 0
if (y_span * x_span == 0) {
return 0;
}
// If fewer y tiles, loop over y slices else loop over x slices
bool isY = y_span < x_span;
return processTiles(
con_o, disc, t, p,
bbox_min, bbox_max,
bbox_argmin, bbox_argmax,
rect_min, rect_max,
grid, isY,
idx, off, depth,
gaussian_keys_unsorted,
gaussian_values_unsorted
);
}1.Modify gaussian_model.py
# gaussian_model.py
class GaussianModel:
def __init__(self, sh_degree, optimizer_type="default"):
self.active_sh_degree = 0
self.optimizer_type = optimizer_type
self.max_sh_degree = sh_degree
self._xyz = torch.empty(0)
self._features_dc = torch.empty(0)
self._features_rest = torch.empty(0)
self._scaling = torch.empty(0)
self._rotation = torch.empty(0)
self._opacity = torch.empty(0)
self.max_radii2D = torch.empty(0)
self.xyz_gradient_accum = torch.empty(0)
self.xyz_gradient_accum_abs = torch.empty(0) # Here, Please add
self.denom = torch.empty(0)
self.optimizer = None
self.shoptimizer = None # Here, Please add
self.percent_dense = 0
self.spatial_lr_scale = 0
self.setup_functions()
def capture(self, optimizer_type):
if optimizer_type == "default":
return (
self.active_sh_degree,
self._xyz,
self._features_dc,
self._features_rest,
self._scaling,
self._rotation,
self._opacity,
self.max_radii2D,
self.xyz_gradient_accum,
self.xyz_gradient_accum_abs, # Here, Please add
self.denom,
self.optimizer.state_dict(),
self.shoptimizer.state_dict(), # Here, Please add
self.spatial_lr_scale,
)
else:
return (
self.active_sh_degree,
self._xyz,
self._features_dc,
self._features_rest,
self._scaling,
self._rotation,
self._opacity,
self.max_radii2D,
self.xyz_gradient_accum,
self.xyz_gradient_accum_abs, # Here, Please add
self.denom,
self.optimizer.state_dict(),
self.spatial_lr_scale,
)
def restore(self, model_args, training_args):
(self.active_sh_degree,
self._xyz,
self._features_dc,
self._features_rest,
self._scaling,
self._rotation,
self._opacity,
self.max_radii2D,
xyz_gradient_accum,
xyz_gradient_accum_abs, # Here, Please add
denom,
opt_dict,
shopt_dict, # Please add
self.spatial_lr_scale) = model_args
self.training_setup(training_args)
self.xyz_gradient_accum = xyz_gradient_accum
self.xyz_gradient_accum_abs = xyz_gradient_accum_abs # Here, Please add
self.denom = denom
self.optimizer.load_state_dict(opt_dict)
self.shoptimizer.load_state_dict(shopt_dict) # Here, Please add
@property
def get_features_dc(self): # Here, Please add
return self._features_dc
@property
def get_features_rest(self): # Here, Please add
return self._features_rest
def training_setup(self, training_args): # Please add ', training_args'
self.percent_dense = training_args.percent_dense
self.xyz_gradient_accum = torch.zeros((self.get_xyz.shape[0], 1), device="cuda")
self.xyz_gradient_accum_abs = torch.zeros((self.get_xyz.shape[0], 1), device="cuda") # Please add
self.denom = torch.zeros((self.get_xyz.shape[0], 1), device="cuda")
# change learning_rate
l = [
{'params': [self._xyz], 'lr': training_args.position_lr_init * self.spatial_lr_scale, "name": "xyz"},
{'params': [self._features_dc], 'lr': training_args.lowfeature_lr, "name": "f_dc"},
{'params': [self._opacity], 'lr': training_args.opacity_lr, "name": "opacity"},
{'params': [self._scaling], 'lr': training_args.scaling_lr, "name": "scaling"},
{'params': [self._rotation], 'lr': training_args.rotation_lr, "name": "rotation"}
]
sh_l = [{'params': [self._features_rest], 'lr': training_args.highfeature_lr / 20.0, "name": "f_rest"}]
if self.optimizer_type == "default":
self.optimizer = torch.optim.Adam(l, lr=0.0, eps=1e-15)
self.shoptimizer = torch.optim.Adam(sh_l, lr=0.0, eps=1e-15) # Here, Please add
elif self.optimizer_type == "sparse_adam":
self.optimizer = SparseGaussianAdam(l + sh_l, lr=0.0, eps=1e-15)
self.xyz_scheduler_args = get_expon_lr_func(lr_init=training_args.position_lr_init*self.spatial_lr_scale,
lr_final=training_args.position_lr_final*self.spatial_lr_scale,
lr_delay_mult=training_args.position_lr_delay_mult,
max_steps=training_args.position_lr_max_steps)
def optimizer_step(self, iteration): # Here, Please add
''' An optimization schdeuler. The goal is similar to the sparse Adam of taming 3dgs.'''
if iteration <= 15000:
self.optimizer.step()
self.optimizer.zero_grad(set_to_none = True)
if iteration % 16 == 0:
self.shoptimizer.step()
self.shoptimizer.zero_grad(set_to_none = True)
elif iteration <= 20000:
if iteration % 32 ==0:
self.optimizer.step()
self.optimizer.zero_grad(set_to_none = True)
self.shoptimizer.step()
self.shoptimizer.zero_grad(set_to_none = True)
else:
if iteration % 64 ==0:
self.optimizer.step()
self.optimizer.zero_grad(set_to_none = True)
self.shoptimizer.step()
self.shoptimizer.zero_grad(set_to_none = True)
def replace_tensor_to_optimizer(self, tensor, name): # Please replace
optimizable_tensors = {}
for group in self.optimizer.param_groups:
if group["name"] == name:
stored_state = self.optimizer.state.get(group['params'][0], None)
stored_state["exp_avg"] = torch.zeros_like(tensor)
stored_state["exp_avg_sq"] = torch.zeros_like(tensor)
del self.optimizer.state[group['params'][0]]
group["params"][0] = nn.Parameter(tensor.requires_grad_(True))
self.optimizer.state[group['params'][0]] = stored_state
optimizable_tensors[group["name"]] = group["params"][0]
return optimizable_tensors
def _prune_optimizer(self, mask): # Please replace
optimizable_tensors = {}
optimizers = [self.optimizer]
if self.shoptimizer: optimizers.append(self.shoptimizer)
for opt in optimizers:
for group in opt.param_groups:
stored_state = opt.state.get(group['params'][0], None)
if stored_state is not None:
stored_state["exp_avg"] = stored_state["exp_avg"][mask]
stored_state["exp_avg_sq"] = stored_state["exp_avg_sq"][mask]
del opt.state[group['params'][0]]
group["params"][0] = nn.Parameter((group["params"][0][mask].requires_grad_(True)))
opt.state[group['params'][0]] = stored_state
optimizable_tensors[group["name"]] = group["params"][0]
else:
group["params"][0] = nn.Parameter(group["params"][0][mask].requires_grad_(True))
optimizable_tensors[group["name"]] = group["params"][0]
return optimizable_tensors
def prune_points(self, mask): # Please replace
valid_points_mask = ~mask
optimizable_tensors = self._prune_optimizer(valid_points_mask)
self._xyz = optimizable_tensors["xyz"]
self._features_dc = optimizable_tensors["f_dc"]
self._features_rest = optimizable_tensors["f_rest"]
self._opacity = optimizable_tensors["opacity"]
self._scaling = optimizable_tensors["scaling"]
self._rotation = optimizable_tensors["rotation"]
self.xyz_gradient_accum = self.xyz_gradient_accum[valid_points_mask]
self.xyz_gradient_accum_abs = self.xyz_gradient_accum_abs[valid_points_mask] # Please add
self.denom = self.denom[valid_points_mask]
self.max_radii2D = self.max_radii2D[valid_points_mask]
if self.tmp_radii is not None:
self.tmp_radii = self.tmp_radii[valid_points_mask] # Please replace, if tmp_radii
def cat_tensors_to_optimizer(self, tensors_dict): # Please replace
optimizable_tensors = {}
optimizers = [self.optimizer]
if self.shoptimizer: optimizers.append(self.shoptimizer)
for opt in optimizers:
for group in opt.param_groups:
assert len(group["params"]) == 1
extension_tensor = tensors_dict[group["name"]]
stored_state = opt.state.get(group['params'][0], None)
if stored_state is not None:
stored_state["exp_avg"] = torch.cat((stored_state["exp_avg"], torch.zeros_like(extension_tensor)), dim=0)
stored_state["exp_avg_sq"] = torch.cat((stored_state["exp_avg_sq"], torch.zeros_like(extension_tensor)), dim=0)
del opt.state[group['params'][0]]
group["params"][0] = nn.Parameter(torch.cat((group["params"][0], extension_tensor), dim=0).requires_grad_(True))
opt.state[group['params'][0]] = stored_state
optimizable_tensors[group["name"]] = group["params"][0]
else:
group["params"][0] = nn.Parameter(torch.cat((group["params"][0], extension_tensor), dim=0).requires_grad_(True))
optimizable_tensors[group["name"]] = group["params"][0]
return optimizable_tensors
def densification_postfix(self, new_xyz, new_features_dc, new_features_rest, new_opacities, new_scaling, new_rotation, new_tmp_radii): # Please replace
d = {"xyz": new_xyz,
"f_dc": new_features_dc,
"f_rest": new_features_rest,
"opacity": new_opacities,
"scaling" : new_scaling,
"rotation" : new_rotation}
optimizable_tensors = self.cat_tensors_to_optimizer(d)
self._xyz = optimizable_tensors["xyz"]
self._features_dc = optimizable_tensors["f_dc"]
self._features_rest = optimizable_tensors["f_rest"]
self._opacity = optimizable_tensors["opacity"]
self._scaling = optimizable_tensors["scaling"]
self._rotation = optimizable_tensors["rotation"]
self.tmp_radii = torch.cat((self.tmp_radii, new_tmp_radii))
self.xyz_gradient_accum = torch.zeros((self.get_xyz.shape[0], 1), device="cuda")
self.xyz_gradient_accum_abs = torch.zeros((self.get_xyz.shape[0], 1), device="cuda") # abs
self.denom = torch.zeros((self.get_xyz.shape[0], 1), device="cuda")
self.max_radii2D = torch.zeros((self.get_xyz.shape[0]), device="cuda")
def add_densification_stats(self, viewspace_point_tensor, update_filter): # Please replace
self.xyz_gradient_accum[update_filter] += torch.norm(viewspace_point_tensor.grad[update_filter,:2], dim=-1, keepdim=True)
self.xyz_gradient_accum_abs[update_filter] += torch.norm(viewspace_point_tensor.grad[update_filter, 2:], dim=-1, keepdim=True)
self.denom[update_filter] += 1
2.Add Abs_grad in the rasterization process.
# submoudles/diff-gaussian-.../rasterize_points.cu, please replace
torch::Tensor dL_dmeans2D = torch::zeros({P, 4}, means3D.options()); // abs# submoudles/diff-gaussian-.../cuda_rasterizer/rasterizer_impl.cu
void CudaRasterizer::Rasterizer::backward(
const int P, int D, int M, int R, int B,
const float* background,
const int width, int height,
const float* means3D,
const float* dc,
const float* shs,
const float* colors_precomp,
const float* scales,
const float scale_modifier,
const float* rotations,
const float* cov3D_precomp,
const float* viewmatrix,
const float* projmatrix,
const float* campos,
const float tan_fovx, float tan_fovy,
const int* radii,
char* geom_buffer,
char* binning_buffer,
char* img_buffer,
char* sample_buffer,
const float* dL_dpix,
float* dL_dmean2D,
float* dL_dconic,
float* dL_dopacity,
float* dL_dcolor,
float* dL_dmean3D,
float* dL_dcov3D,
float* dL_ddc,
float* dL_dsh,
float* dL_dscale,
float* dL_drot,
bool debug)
{
GeometryState geomState = GeometryState::fromChunk(geom_buffer, P);
BinningState binningState = BinningState::fromChunk(binning_buffer, R);
ImageState imgState = ImageState::fromChunk(img_buffer, width * height);
SampleState sampleState = SampleState::fromChunk(sample_buffer, B);
if (radii == nullptr)
{
radii = geomState.internal_radii;
}
const float focal_y = height / (2.0f * tan_fovy);
const float focal_x = width / (2.0f * tan_fovx);
const dim3 tile_grid((width + BLOCK_X - 1) / BLOCK_X, (height + BLOCK_Y - 1) / BLOCK_Y, 1);
const dim3 block(BLOCK_X, BLOCK_Y, 1);
// Compute loss gradients w.r.t. 2D mean position, conic matrix,
// opacity and RGB of Gaussians from per-pixel loss gradients.
// If we were given precomputed colors and not SHs, use them.
const float* color_ptr = (colors_precomp != nullptr) ? colors_precomp : geomState.rgb;
CHECK_CUDA(BACKWARD::render(
tile_grid,
block,
imgState.ranges,
binningState.point_list,
width, height, R, B,
imgState.bucket_offsets,
sampleState.bucket_to_tile,
sampleState.T,
sampleState.ar,
background,
geomState.means2D,
geomState.conic_opacity,
color_ptr,
imgState.accum_alpha,
imgState.n_contrib,
imgState.max_contrib,
imgState.pixel_colors,
dL_dpix,
(float4*)dL_dmean2D, # Here, please replace
(float4*)dL_dconic,
dL_dopacity,
dL_dcolor), debug)
// Take care of the rest of preprocessing. Was the precomputed covariance
// given to us or a scales/rot pair? If precomputed, pass that. If not,
// use the one we computed ourselves.
const float* cov3D_ptr = (cov3D_precomp != nullptr) ? cov3D_precomp : geomState.cov3D;
CHECK_CUDA(BACKWARD::preprocess(P, D, M,
(float3*)means3D,
radii,
dc,
shs,
geomState.clamped,
(glm::vec3*)scales,
(glm::vec4*)rotations,
scale_modifier,
cov3D_ptr,
viewmatrix,
projmatrix,
focal_x, focal_y,
tan_fovx, tan_fovy,
(glm::vec3*)campos,
(float4*)dL_dmean2D, # Here, please replace
dL_dconic,
(glm::vec3*)dL_dmean3D,
dL_dcolor,
dL_dcov3D,
dL_ddc,
dL_dsh,
(glm::vec3*)dL_dscale,
(glm::vec4*)dL_drot), debug)
}# submoudles/diff-gaussian-.../cuda_rasterizer/backward.h
namespace BACKWARD
{
void render(
const dim3 grid, dim3 block,
const uint2* ranges,
const uint32_t* point_list,
int W, int H, int R, int B,
const uint32_t* per_bucket_tile_offset,
const uint32_t* bucket_to_tile,
const float* sampled_T, const float* sampled_ar,
const float* bg_color,
const float2* means2D,
const float4* conic_opacity,
const float* colors,
const float* final_Ts,
const uint32_t* n_contrib,
const uint32_t* max_contrib,
const float* pixel_colors,
const float* dL_dpixels,
float4* dL_dmean2D, # Here, please replace
float4* dL_dconic2D,
float* dL_dopacity,
float* dL_dcolors);
void preprocess(
int P, int D, int M,
const float3* means,
const int* radii,
const float* dc,
const float* shs,
const bool* clamped,
const glm::vec3* scales,
const glm::vec4* rotations,
const float scale_modifier,
const float* cov3Ds,
const float* view,
const float* proj,
const float focal_x, float focal_y,
const float tan_fovx, float tan_fovy,
const glm::vec3* campos,
const float4* dL_dmean2D, # Here, please replace
const float* dL_dconics,
glm::vec3* dL_dmeans,
float* dL_dcolor,
float* dL_dcov3D,
float* dL_ddc,
float* dL_dsh,
glm::vec3* dL_dscale,
glm::vec4* dL_drot);
}# submoudles/diff-gaussian-.../cuda_rasterizer/backward.cu
template<int C>
__global__ void preprocessCUDA(
int P, int D, int M,
const float3* means,
const int* radii,
const float* dc,
const float* shs,
const bool* clamped,
const glm::vec3* scales,
const glm::vec4* rotations,
const float scale_modifier,
const float* proj,
const glm::vec3* campos,
const float4* dL_dmean2D, # Here, please replace
glm::vec3* dL_dmeans,
float* dL_dcolor,
float* dL_dcov3D,
float* dL_ddc,
float* dL_dsh,
glm::vec3* dL_dscale,
glm::vec4* dL_drot)
template<uint32_t C>
__global__ void
PerGaussianRenderCUDA(
const uint2* __restrict__ ranges,
const uint32_t* __restrict__ point_list,
int W, int H, int B,
const uint32_t* __restrict__ per_tile_bucket_offset,
const uint32_t* __restrict__ bucket_to_tile,
const float* __restrict__ sampled_T, const float* __restrict__ sampled_ar,
const float* __restrict__ bg_color,
const float2* __restrict__ points_xy_image,
const float4* __restrict__ conic_opacity,
const float* __restrict__ colors,
const float* __restrict__ final_Ts,
const uint32_t* __restrict__ n_contrib,
const uint32_t* __restrict__ max_contrib,
const float* __restrict__ pixel_colors,
const float* __restrict__ dL_dpixels,
float4* __restrict__ dL_dmean2D, # Here, please replace
float4* __restrict__ dL_dconic2D,
float* __restrict__ dL_dopacity,
float* __restrict__ dL_dcolors
) {
// global_bucket_idx = warp_idx
auto block = cg::this_thread_block();
auto my_warp = cg::tiled_partition<32>(block);
uint32_t global_bucket_idx = block.group_index().x * my_warp.meta_group_size() + my_warp.meta_group_rank();
bool valid_bucket = global_bucket_idx < (uint32_t) B;
if (!valid_bucket) return;
bool valid_splat = false;
uint32_t tile_id, bbm;
uint2 range;
int num_splats_in_tile, bucket_idx_in_tile;
int splat_idx_in_tile, splat_idx_global;
tile_id = bucket_to_tile[global_bucket_idx];
range = ranges[tile_id];
num_splats_in_tile = range.y - range.x;
// What is the number of buckets before me? what is my offset?
bbm = tile_id == 0 ? 0 : per_tile_bucket_offset[tile_id - 1];
bucket_idx_in_tile = global_bucket_idx - bbm;
splat_idx_in_tile = bucket_idx_in_tile * 32 + my_warp.thread_rank();
splat_idx_global = range.x + splat_idx_in_tile;
valid_splat = (splat_idx_in_tile < num_splats_in_tile);
// if first gaussian in bucket is useless, then others are also useless
if (bucket_idx_in_tile * 32 >= max_contrib[tile_id]) {
return;
}
// Load Gaussian properties into registers
int gaussian_idx = 0;
float2 xy = {0.0f, 0.0f};
float4 con_o = {0.0f, 0.0f, 0.0f, 0.0f};
float c[C] = {0.0f};
if (valid_splat) {
gaussian_idx = point_list[splat_idx_global];
xy = points_xy_image[gaussian_idx];
con_o = conic_opacity[gaussian_idx];
for (int ch = 0; ch < C; ++ch)
c[ch] = colors[gaussian_idx * C + ch];
}
// Gradient accumulation variables
float Register_dL_dmean2D_x = 0.0f;
float Register_dL_dmean2D_y = 0.0f;
float Register_dL_dmean2D_z = 0.0f; //abs # Here, please add
float Register_dL_dmean2D_w = 0.0f; //abs # Here, please add
float Register_dL_dconic2D_x = 0.0f;
float Register_dL_dconic2D_y = 0.0f;
float Register_dL_dconic2D_w = 0.0f;
float Register_dL_dopacity = 0.0f;
float Register_dL_dcolors[C] = {0.0f};
// tile metadata
const uint32_t horizontal_blocks = (W + BLOCK_X - 1) / BLOCK_X;
const uint2 tile = {tile_id % horizontal_blocks, tile_id / horizontal_blocks};
const uint2 pix_min = {tile.x * BLOCK_X, tile.y * BLOCK_Y};
// values useful for gradient calculation
float T;
float T_final;
float last_contributor;
float ar[C];
float dL_dpixel[C];
const float ddelx_dx = 0.5 * W;
const float ddely_dy = 0.5 * H;
// shared memory
__shared__ float Shared_sampled_ar[32 * C + 1];
sampled_ar += global_bucket_idx * BLOCK_SIZE * C;
__shared__ float Shared_pixels[32 * C];
// iterate over all pixels in the tile
#pragma unroll
for (int i = 0; i < BLOCK_SIZE + 31; ++i) {
if (i % 32 == 0) {
for (int ch = 0; ch < C; ++ch) {
int shift = BLOCK_SIZE * ch + i + block.thread_rank();
Shared_sampled_ar[ch * 32 + block.thread_rank()] = sampled_ar[shift];
}
const uint32_t local_id = i + block.thread_rank();
const uint2 pix = {pix_min.x + local_id % BLOCK_X, pix_min.y + local_id / BLOCK_X};
const uint32_t id = W * pix.y + pix.x;
for (int ch = 0; ch < C; ++ch) {
Shared_pixels[ch * 32 + block.thread_rank()] = pixel_colors[ch * H * W + id];
}
block.sync();
}
// SHUFFLING
// At this point, T already has my (1 - alpha) multiplied.
// So pass this ready-made T value to next thread.
T = my_warp.shfl_up(T, 1);
last_contributor = my_warp.shfl_up(last_contributor, 1);
T_final = my_warp.shfl_up(T_final, 1);
for (int ch = 0; ch < C; ++ch) {
ar[ch] = my_warp.shfl_up(ar[ch], 1);
dL_dpixel[ch] = my_warp.shfl_up(dL_dpixel[ch], 1);
}
// which pixel index should this thread deal with?
int idx = i - my_warp.thread_rank();
const uint2 pix = {pix_min.x + idx % BLOCK_X, pix_min.y + idx / BLOCK_X};
const uint32_t pix_id = W * pix.y + pix.x;
const float2 pixf = {(float) pix.x, (float) pix.y};
bool valid_pixel = pix.x < W && pix.y < H;
// every 32nd thread should read the stored state from memory
// TODO: perhaps store these things in shared memory?
if (valid_splat && valid_pixel && my_warp.thread_rank() == 0 && idx < BLOCK_SIZE) {
T = sampled_T[global_bucket_idx * BLOCK_SIZE + idx];
int ii = i % 32;
for (int ch = 0; ch < C; ++ch)
ar[ch] = -Shared_pixels[ch * 32 + ii] + Shared_sampled_ar[ch * 32 + ii];
T_final = final_Ts[pix_id];
last_contributor = n_contrib[pix_id];
for (int ch = 0; ch < C; ++ch) {
dL_dpixel[ch] = dL_dpixels[ch * H * W + pix_id];
}
}
// do work
if (valid_splat && valid_pixel && 0 <= idx && idx < BLOCK_SIZE) {
if (W <= pix.x || H <= pix.y) continue;
if (splat_idx_in_tile >= last_contributor) continue;
// compute blending values
const float2 d = { xy.x - pixf.x, xy.y - pixf.y };
const float power = -0.5f * (con_o.x * d.x * d.x + con_o.z * d.y * d.y) - con_o.y * d.x * d.y;
if (power > 0.0f) continue;
const float G = exp(power);
const float alpha = min(0.99f, con_o.w * G);
if (alpha < 1.0f / 255.0f) continue;
const float dchannel_dcolor = alpha * T;
const float one_minus_alpha_reci = 1.0f / (1.0f - alpha);
// add the gradient contribution of this pixel to the gaussian
float dL_dalpha = 0.0f;
for (int ch = 0; ch < C; ++ch) {
ar[ch] += dchannel_dcolor * c[ch];
const float &dL_dchannel = dL_dpixel[ch];
Register_dL_dcolors[ch] += dchannel_dcolor * dL_dchannel;
dL_dalpha += (c[ch] * T + one_minus_alpha_reci * ar[ch]) * dL_dchannel;
}
float bg_dot_dpixel = 0.0f;
for (int ch = 0; ch < C; ++ch) {
bg_dot_dpixel += bg_color[ch] * dL_dpixel[ch];
}
dL_dalpha += (-T_final * one_minus_alpha_reci) * bg_dot_dpixel;
T *= (1.0f - alpha);
// Helpful reusable temporary variables
const float dL_dG = con_o.w * dL_dalpha;
const float gdx = G * d.x;
const float gdy = G * d.y;
const float dG_ddelx = -gdx * con_o.x - gdy * con_o.y;
const float dG_ddely = -gdy * con_o.z - gdx * con_o.y;
// accumulate the gradients
const float tmp_x = dL_dG * dG_ddelx * ddelx_dx;
Register_dL_dmean2D_x += tmp_x;
const float tmp_y = dL_dG * dG_ddely * ddely_dy;
Register_dL_dmean2D_y += tmp_y;
//abs
// const float tmp_z = fabs(dL_dG * dG_ddelx * ddelx_dx);
Register_dL_dmean2D_z += fabs(tmp_x); # Here, please add
// const float tmp_w = fabs(dL_dG * dG_ddely * ddely_dy);
Register_dL_dmean2D_w += fabs(tmp_y); # Here, please add
Register_dL_dconic2D_x += -0.5f * gdx * d.x * dL_dG;
Register_dL_dconic2D_y += -0.5f * gdx * d.y * dL_dG;
Register_dL_dconic2D_w += -0.5f * gdy * d.y * dL_dG;
Register_dL_dopacity += G * dL_dalpha;
}
}
// finally add the gradients using atomics
if (valid_splat) {
atomicAdd(&dL_dmean2D[gaussian_idx].x, Register_dL_dmean2D_x);
atomicAdd(&dL_dmean2D[gaussian_idx].y, Register_dL_dmean2D_y);
atomicAdd(&dL_dmean2D[gaussian_idx].z, Register_dL_dmean2D_z); # Here, please add
atomicAdd(&dL_dmean2D[gaussian_idx].w, Register_dL_dmean2D_w); # Here, please add
atomicAdd(&dL_dconic2D[gaussian_idx].x, Register_dL_dconic2D_x);
atomicAdd(&dL_dconic2D[gaussian_idx].y, Register_dL_dconic2D_y);
atomicAdd(&dL_dconic2D[gaussian_idx].w, Register_dL_dconic2D_w);
atomicAdd(&dL_dopacity[gaussian_idx], Register_dL_dopacity);
for (int ch = 0; ch < C; ++ch) {
atomicAdd(&dL_dcolors[gaussian_idx * C + ch], Register_dL_dcolors[ch]);
}
}
}
void BACKWARD::preprocess(
int P, int D, int M,
const float3* means3D,
const int* radii,
const float* dc,
const float* shs,
const bool* clamped,
const glm::vec3* scales,
const glm::vec4* rotations,
const float scale_modifier,
const float* cov3Ds,
const float* viewmatrix,
const float* projmatrix,
const float focal_x, float focal_y,
const float tan_fovx, float tan_fovy,
const glm::vec3* campos,
const float4* dL_dmean2D, # Here, please replace
const float* dL_dconic,
glm::vec3* dL_dmean3D,
float* dL_dcolor,
float* dL_dcov3D,
float* dL_ddc,
float* dL_dsh,
glm::vec3* dL_dscale,
glm::vec4* dL_drot)
{
// Propagate gradients for the path of 2D conic matrix computation.
// Somewhat long, thus it is its own kernel rather than being part of
// "preprocess". When done, loss gradient w.r.t. 3D means has been
// modified and gradient w.r.t. 3D covariance matrix has been computed.
computeCov2DCUDA << <(P + 255) / 256, 256 >> > (
P,
means3D,
radii,
cov3Ds,
focal_x,
focal_y,
tan_fovx,
tan_fovy,
viewmatrix,
dL_dconic,
(float3*)dL_dmean3D,
dL_dcov3D);
// Propagate gradients for remaining steps: finish 3D mean gradients,
// propagate color gradients to SH (if desireD), propagate 3D covariance
// matrix gradients to scale and rotation.
preprocessCUDA<NUM_CHAFFELS> << < (P + 255) / 256, 256 >> > (
P, D, M,
(float3*)means3D,
radii,
dc,
shs,
clamped,
(glm::vec3*)scales,
(glm::vec4*)rotations,
scale_modifier,
projmatrix,
campos,
(float4*)dL_dmean2D, # Here, please replace
(glm::vec3*)dL_dmean3D,
dL_dcolor,
dL_dcov3D,
dL_ddc,
dL_dsh,
dL_dscale,
dL_drot);
}
void BACKWARD::render(
const dim3 grid, dim3 block,
const uint2* ranges,
const uint32_t* point_list,
int W, int H, int R, int B,
const uint32_t* per_bucket_tile_offset,
const uint32_t* bucket_to_tile,
const float* sampled_T, const float* sampled_ar,
const float* bg_color,
const float2* means2D,
const float4* conic_opacity,
const float* colors,
const float* final_Ts,
const uint32_t* n_contrib,
const uint32_t* max_contrib,
const float* pixel_colors,
const float* dL_dpixels,
float4* dL_dmean2D, # Here, please replace
float4* dL_dconic2D,
float* dL_dopacity,
float* dL_dcolors)
{
const int THREADS = 32;
PerGaussianRenderCUDA<NUM_CHAFFELS> <<<((B*32) + THREADS - 1) / THREADS,THREADS>>>(
ranges,
point_list,
W, H, B,
per_bucket_tile_offset,
bucket_to_tile,
sampled_T, sampled_ar,
bg_color,
means2D,
conic_opacity,
colors,
final_Ts,
n_contrib,
max_contrib,
pixel_colors,
dL_dpixels,
dL_dmean2D,
dL_dconic2D,
dL_dopacity,
dL_dcolors
);
}- Hyperparameters
# arguments/__init__.py
class OptimizationParams(ParamGroup):
def __init__(self, parser):
# fastgs parameters
self.loss_thresh = 0.1
self.grad_abs_thresh = 0.0012
self.highfeature_lr = 0.005
self.lowfeature_lr = 0.0025
self.grad_thresh = 0.0002
self.dense = 0.001
self.mult = 0.5 # multiplier for the compact box to control the tile number of each splat
# train.py
gaussians.training_setup(opt) # add opt
# Optimization step
if iteration < opt.iterations:
if opt.optimizer_type == "default":
gaussians.optimizer_step(iteration) # please use this
# render.py
parser.add_argument("--mult", type=float, default=0.5)