-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathmetric.py
More file actions
506 lines (414 loc) · 20.5 KB
/
metric.py
File metadata and controls
506 lines (414 loc) · 20.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
import os
import json
import torch
import support_util
import query_util
from tqdm import tqdm
import PIL.Image
import numpy as np
from pycocotools import mask as mask_utils
import pycocotools.coco
import pycocotools.cocoeval
import torch.nn.functional as F
import time
from ensemble_boxes import weighted_boxes_fusion
import numpy as np
from scipy.linalg import eigh
from collections import defaultdict
import copy
from torchvision.ops import batched_nms
import cv2
from ensemble_boxes import *
def get_category_id_to_name(coco_style_loader):
    """Build a lookup from COCO category id to its human-readable name.

    Args:
        coco_style_loader: a pycocotools ``COCO``-like object exposing
            ``getCatIds()`` and ``loadCats()``.

    Returns:
        dict mapping ``category_id`` (int) -> ``category_name`` (str).
    """
    mapping = {}
    for category in coco_style_loader.loadCats(coco_style_loader.getCatIds()):
        mapping[category['id']] = category['name']
    return mapping
def graph_diffusion_ios(masks_binary, labels, class_num, max_iter, alpha, rank_score=True,
                        tol=1e-6):
    """Score per-mask redundancy via a personalized random walk over mask overlaps.

    For each class, builds a directed overlap graph between that class's masks
    (edge weight = intersection / source-mask area, i.e. an IoS matrix), then
    iterates ``pi = alpha * P @ pi + (1 - alpha) * personal_vector`` to a fixed
    point. Masks heavily covered by other (higher-ranked, when ``rank_score``)
    masks of the same class receive a high score, which callers use to decay
    confidence of redundant detections.

    Args:
        masks_binary: tensor of n binary masks; any trailing shape, flattened
            internally to ``[n, -1]``.
        labels: int tensor of length n with class indices in ``[0, class_num)``.
        class_num: number of classes to iterate over.
        max_iter: maximum diffusion iterations per class.
        alpha: restart/damping factor of the random walk.
        rank_score: if True, keep only the lower triangle of the overlap
            matrix so each mask is only "suppressed" by earlier (higher-ranked)
            masks — callers are expected to pass masks sorted by score.
        tol: early-stop threshold on ``||pi - pi_old||``.

    Returns:
        Float tensor of shape ``[n]`` with the diffusion score per mask
        (all zeros when n == 1).
    """
    n_masks = masks_binary.shape[0]
    masks = masks_binary.reshape(n_masks, -1).to(dtype=torch.float32)
    rw_ios = torch.zeros((n_masks,), device=masks_binary.device, dtype=torch.float32)
    if n_masks == 1:
        # A single mask cannot be redundant with anything.
        return rw_ios
    for cat_ind in range(class_num):
        cat_ind_tensor = torch.tensor(cat_ind, device=labels.device, dtype=labels.dtype)
        select_idxs = (labels == cat_ind_tensor)
        if select_idxs.sum() == 0:
            continue
        _masks = masks[select_idxs]
        n_cat = _masks.shape[0]
        # Per-mask pixel counts, clamped to avoid division by zero for empty masks.
        pos_num = torch.clamp(_masks.sum(dim=-1).to(dtype=torch.float32), min=1e-6)
        # Pairwise intersection counts; zero the diagonal (self-overlap).
        inter_num = _masks @ _masks.t()
        inter_num.fill_diagonal_(0.0)
        if rank_score:
            # Only let earlier (higher-scored) masks suppress later ones.
            inter_num = torch.tril(inter_num, diagonal=0)
        # IoS matrix: intersection normalized by the row mask's own area.
        iou_matrix = inter_num / pos_num[:, None]
        personal_vector = iou_matrix.max(dim=-1)[0]
        # Row-normalize into a transition matrix P.
        # BUGFIX: the original divided by the *boolean* validity mask
        # (`valid_rows[idx]` == True == 1.0), so rows were never actually
        # normalized; divide by the real row sums instead.
        row_sums = iou_matrix.sum(dim=1, keepdim=True)
        P = torch.where(
            row_sums > 1e-10,
            iou_matrix / row_sums.clamp(min=1e-10),
            torch.zeros_like(iou_matrix),
        )
        # Personalized random walk from the uniform distribution.
        pi = torch.ones(n_cat, device=P.device, dtype=P.dtype) / n_cat
        for _ in range(max_iter):
            pi_old = pi
            pi = alpha * (P @ pi) + (1 - alpha) * personal_vector
            if torch.norm(pi - pi_old) < tol:
                break
        rw_ios[select_idxs] += pi
    return rw_ios
def generate_coco_style_predictions_upn(coco_style_loader,
                                        image_root_dir,
                                        sam2_mask_predictor,
                                        feat_extractor_name,
                                        feat_extractor,
                                        image_transform,
                                        proto_feat,
                                        proto_cls,
                                        upn,  # UPN model passed from main.py
                                        diffusion_steps,
                                        alp,
                                        lamb,
                                        device='cuda',
                                        min_threshold=0.01,
                                        ):
    """
    Generate COCO-format detections for every image in `coco_style_loader`.

    Pipeline per image:
      1. UPN proposes boxes (coarse-grained prompt), filtered by score.
      2. SAM2 converts each proposal box into a segmentation mask.
      3. Each mask pools a feature vector from the backbone feature map and is
         classified by cosine similarity against prototype features.
      4. Scores are decayed by a graph-diffusion redundancy measure
         (`graph_diffusion_ios`) and the top 100 detections are kept.

    Args:
        coco_style_loader: COCO object for VOC2007 test set.
        image_root_dir: root directory where image files are stored.
        sam2_mask_predictor: initialized SAM2 mask predictor.
        feat_extractor_name: name of feature extractor (DINOV2).
        feat_extractor: feature extractor, e.g. DINOv2 model.
        image_transform: preprocessing transform for feat_extractor.
        proto_feat: prototype feature (tensor).
        proto_cls: prototype cls name.
        upn: initialized UPN model for proposal generation.
        diffusion_steps: number of diffusion steps.
        alp: alpha in diffusion.
        lamb: lamda for decay.
        device: torch or CUDA device.
        min_threshold: minimum threshold for proposal filtering.
    Returns:
        List of prediction dicts in COCO format.
    """
    id_to_name = get_category_id_to_name(coco_style_loader)
    # Invert to look up category ids from prototype class names.
    name_to_id = {v: k for k, v in id_to_name.items()}
    batch_size = 32  # Process N boxes at a time in SAM2
    results = []
    # Select the feature extraction callable by name.
    if feat_extractor_name == 'DINOV2':
        extractor = support_util.get_dinov2_features
    elif feat_extractor_name == 'RADIO':
        from model.radio import get_radio_features
        extractor = get_radio_features
    else:
        raise ValueError(f"Unsupported feature extractor: {feat_extractor_name}")
    # upn info
    candid_prompt = ["fine_grained_prompt", "coarse_grained_prompt"]
    # UPN model is now passed as parameter from main.py
    # Load all image metadata from COCO
    for img_dict in tqdm(coco_style_loader.dataset['images'], desc='Generating predictions'):
        # for img_dict in coco_style_loader.dataset['images']:
        img_id = img_dict['id']
        file_name = img_dict['file_name']
        img_path = os.path.join(image_root_dir, file_name)
        try:
            img_pil = PIL.Image.open(img_path).convert("RGB")
        except Exception as e:
            # Unreadable/missing image: warn and skip rather than abort the run.
            print(f"[Warning] Failed to load image {img_path}: {e}")
            continue
        # Proposal generation with the coarse-grained prompt.
        # NOTE(review): a very-low-threshold filtered copy is kept as a
        # fallback in case the stricter `min_threshold` filter removes
        # every proposal.
        proposals = upn.inference(img_pil, candid_prompt[1])
        proposals_coarse = upn.filter(proposals, min_score=0.01, nms_value=1)
        # chek proposals
        if proposals is None or len(proposals.get('original_xyxy_boxes', [])) == 0:
            continue
        else:
            proposals = upn.filter(proposals, min_score=min_threshold, nms_value=1)
            if len(proposals.get('original_xyxy_boxes', [])) == 0:
                # Strict filter emptied the set: fall back to the loose one.
                proposals = proposals_coarse
                boxes = proposals['original_xyxy_boxes'][0]
                scores = proposals['scores'][0]
            else:
                boxes = proposals['original_xyxy_boxes'][0]
                scores = proposals['scores'][0]
        # 1. Extract DINOv2 feature map
        # NOTE(review): downstream indexing assumes feat_map is 4-D
        # [1, C, Hf, Wf] (see feat_map.shape[2:] and sum(dim=[2, 3])) —
        # confirm against the extractor's return shape.
        feat_map = extractor(feat_extractor, image_transform, img_pil, device=device)
        # 2. with the upn info, to get the candidate mask in iter
        sam2_mask_predictor.set_image(img_pil)
        # Sort boxes and scores by scores in descending order and take top 200
        if len(boxes) > 0 and len(scores) > 0:
            # Create list of (score, box) pairs
            box_score_pairs = list(zip(scores, boxes))
            # Sort by score in descending order
            box_score_pairs.sort(key=lambda x: x[0], reverse=True)
            # Take top 500, maybe 100 is better
            # NOTE(review): despite the names above, only the top 100
            # proposals are actually kept here.
            top_500_pairs = box_score_pairs[:100]
            # Unzip back to scores and boxes
            scores, boxes = zip(*top_500_pairs)
            scores = list(scores)
            boxes = list(boxes)
        # collect all the results of the current image
        img_results = []
        if len(boxes) > 0:
            # Clip all boxes to valid image region
            iw, ih = img_pil.size
            clipped_boxes = []
            for box in boxes:
                x1, y1, x2, y2 = box
                # Clip coordinates to image boundaries
                x1 = max(0, min(x1, iw))
                y1 = max(0, min(y1, ih))
                x2 = max(0, min(x2, iw))
                y2 = max(0, min(y2, ih))
                # Ensure valid bbox (width and height > 0)
                if x2 > x1 and y2 > y1:
                    clipped_boxes.append([x1, y1, x2, y2])
            # NOTE(review): degenerate boxes are dropped here while `scores`
            # is left untouched, so boxes/scores can fall out of alignment
            # if any box is discarded — verify clipping never removes boxes
            # in practice.
            boxes = clipped_boxes
        for i in range(0, len(boxes), batch_size):
            # Get current batch of boxes
            batch_end = min(i + batch_size, len(boxes))
            batch_boxes = boxes[i:batch_end]
            batch_scores = scores[i:batch_end]
            # Convert batch boxes to numpy array format expected by SAM2
            batch_boxes_array = np.array(batch_boxes)
            # Predict masks for this batch
            masks, mask_scores, masks_256 = sam2_mask_predictor.predict(
                point_coords=None,
                point_labels=None,
                box=batch_boxes_array,
                multimask_output=False  # Get one mask per box
            )
            # Process each mask and corresponding box in the batch
            for j, (bbox, score, mask, mask_score) in enumerate(zip(batch_boxes, batch_scores, masks, mask_scores)):
                # Handle different mask formats
                if isinstance(mask, (list, tuple)) and len(mask) > 0:
                    mask_to_use = mask[0]
                else:
                    mask_to_use = mask
                # Downsample the mask to the feature-map resolution, then
                # masked-average-pool a single feature vector for the object.
                masks_resize = support_util.resize_mask_to_features(mask_to_use, feat_map.shape[2:])
                masks_resize = torch.from_numpy(masks_resize).cuda()
                masked_feat = feat_map * masks_resize
                valid_pixel_count = masks_resize.sum()
                feat_vec = F.normalize(masked_feat.sum(dim=[2, 3]) / (valid_pixel_count + 1e-7), eps=1e-2)
                # Classify by cosine similarity against prototype features.
                sims = feat_vec @ proto_feat
                top_score, top_cls = torch.max(sims, dim=1)
                cat_id = name_to_id.get(proto_cls[top_cls[0].item()])
                if cat_id is None:
                    # Prototype class not present in this dataset's categories.
                    continue
                # Handle mask encoding - mask_utils.encode returns a list
                encoded_mask = mask_utils.encode(np.asfortranarray(mask_to_use.astype(np.uint8)))
                # If encoded_mask is a list, take the first element
                if isinstance(encoded_mask, (list, tuple)) and len(encoded_mask) > 0:
                    encoded_mask = encoded_mask[0]
                # Now encoded_mask should be a dict, handle counts
                if isinstance(encoded_mask, dict) and 'counts' in encoded_mask:
                    if isinstance(encoded_mask['counts'], bytes):
                        # RLE counts must be a str to be JSON-serializable.
                        encoded_mask['counts'] = encoded_mask['counts'].decode('utf-8')
                else:
                    print(f"[WARNING] encoded_mask is not a dict or missing counts: {type(encoded_mask)}")
                    continue
                # Normalize the low-res mask to [1, 1, H, W] for later stacking.
                masks_for_ios = masks_resize.clone()
                if masks_for_ios.dim() == 2:  # [H, W]
                    masks_for_ios = masks_for_ios.unsqueeze(0).unsqueeze(0)  # [1, 1, H, W]
                elif masks_for_ios.dim() == 3:  # [1, H, W]
                    masks_for_ios = masks_for_ios.unsqueeze(0)  # [1, 1, H, W]
                img_results.append({
                    'image_id': img_id,
                    'feat': feat_vec.to(torch.float32).cpu().numpy(),
                    'masks_for_ios': masks_for_ios.to(torch.float32).cpu().numpy(),
                    'category_id': cat_id,
                    'segmentation': encoded_mask,
                    # COCO bbox format: [x, y, width, height].
                    'bbox': [bbox[0], bbox[1], bbox[2] - bbox[0], bbox[3] - bbox[1]],
                    'score': float(top_score.item()),
                })
        # Graph Diffusion: decay scores of masks that the diffusion marks
        # as redundant with same-class masks.
        if img_results:
            # collect all the masks and related information for ios calculation
            all_masks = []
            all_categories = []
            all_object_sims = []
            all_feats = []
            for item in img_results:
                all_masks.append(torch.from_numpy(item['masks_for_ios']).cuda())
                all_categories.append(item['category_id'])
                all_feats.append(torch.from_numpy(item['feat']).cuda())
            # stack all the masks and features
            if all_masks:
                stacked_masks = torch.stack(all_masks, dim=0)  # [n_masks, 1, H, W]
                # fix the categories indexing problem: map the actual category_id to a continuous index
                unique_categories = list(set(all_categories))
                category_to_idx = {cat: idx for idx, cat in enumerate(unique_categories)}
                stacked_categories = torch.tensor([category_to_idx[cat] for cat in all_categories],
                                                  device=stacked_masks.device, dtype=torch.long)
                # compute the ios between all the masks
                try:
                    # Extract scores for sorting if softmerge_sort is enabled
                    ios_result = graph_diffusion_ios(stacked_masks, stacked_categories,
                                                     len(unique_categories), max_iter=diffusion_steps, alpha=alp)
                    ios = ios_result
                    # Apply score decay normally when no sorting:
                    # score *= (1 - ios) ** lamb, floored at 0.
                    for i, item in enumerate(img_results):
                        if i < len(ios):
                            score_decay = 1 - ios[i]
                            if score_decay < 0:
                                score_decay = torch.tensor(0.0)
                            item['score'] = float(item['score'] * torch.pow(score_decay, lamb))
                except Exception as e:
                    # Best-effort: keep undecayed scores if diffusion fails.
                    print(f"[Warning] compute_semantic_ios failed: {e}, skipping IoU computation")
        # top 100
        if img_results:
            img_results.sort(key=lambda x: x['score'], reverse=True)
            top_100_img_results = img_results[:100]
            results.extend(top_100_img_results)
    return results
def run_coco_eval(gt_json_path, prediction_results, pred_json='temp_predictions.json',
                  target_categories=None, filter_by_categories=True, save_results=True):
    """Run COCO bbox (and, when GT has masks, segm) evaluation on predictions.

    Args:
        gt_json_path: path to the COCO ground-truth annotation JSON.
        prediction_results: list of COCO-format prediction dicts.
            NOTE: mutated in place — bulky keys ('feat', 'segmentation',
            'masks_for_ios') are removed to save storage before dumping.
        pred_json: path where the (stripped) predictions are dumped.
        target_categories: optional list of category names to restrict
            evaluation to.
        filter_by_categories: if True and `target_categories` is given,
            evaluate only those categories.
        save_results: if True, write full results and a summary JSON under
            './results/'.

    Returns:
        dict mapping eval type ('bbox'/'segm') to its stats/PR arrays.
    """
    # Strip bulky keys to save storage, and make remaining values JSON-safe.
    for result in prediction_results:
        result.pop('feat', None)
        result.pop('segmentation', None)
        result.pop('masks_for_ios', None)
        # Convert any remaining numpy arrays to lists
        for key, value in result.items():
            if isinstance(value, np.ndarray):
                result[key] = value.tolist()
    # Save prediction results to file
    with open(pred_json, 'w') as f:
        json.dump(prediction_results, f)
    # Load ground truth; add fields loadRes expects if missing.
    coco_gt = pycocotools.coco.COCO(gt_json_path)
    coco_gt.dataset.setdefault('info', {"description": "Auto-added info"})
    coco_gt.dataset.setdefault('licenses', [])
    # Determine if segmentation evaluation is possible
    has_segmentation = any(
        isinstance(ann.get("segmentation"), (list, dict)) and ann.get("segmentation")
        for ann in coco_gt.dataset.get("annotations", [])
    )
    # Load predictions.
    # NOTE(review): 'segmentation' was deleted from the predictions above,
    # so loadRes derives box-shaped masks from 'bbox' for segm evaluation —
    # confirm this is the intended behavior.
    coco_dt = coco_gt.loadRes(prediction_results)
    # Choose evaluation types
    eval_types = ['bbox']
    if has_segmentation:
        eval_types.append('segm')
    # Run one COCOeval per type, optionally restricted to target categories.
    use_target = bool(filter_by_categories and target_categories)
    eval_results = {}
    for iou_type in eval_types:
        if use_target:
            print(f"\n====== COCO Evaluation (Target): {iou_type.upper()} ======")
        else:
            print(f"\n====== COCO Evaluation: {iou_type.upper()} ======")
        coco_eval = pycocotools.cocoeval.COCOeval(coco_gt, coco_dt, iouType=iou_type)
        if use_target:
            target_cat_ids = coco_gt.getCatIds(catNms=target_categories)
            print(f"target_cat_ids: {target_cat_ids}")
            print(f"target_categories: {target_categories}")
            coco_eval.params.catIds = target_cat_ids
        coco_eval.evaluate()
        coco_eval.accumulate()
        coco_eval.summarize()
        # Store results for this evaluation type
        entry = {
            'stats': coco_eval.stats.tolist(),
            'precision': coco_eval.eval['precision'].tolist() if 'precision' in coco_eval.eval else None,
            'recall': coco_eval.eval['recall'].tolist() if 'recall' in coco_eval.eval else None,
            'scores': coco_eval.eval['scores'].tolist() if 'scores' in coco_eval.eval else None,
        }
        if use_target:
            entry['target_categories'] = target_categories
        eval_results[iou_type] = entry
    # Save evaluation results to JSON file
    if save_results:
        import datetime
        # Create results directory if it doesn't exist
        results_dir = './results'
        os.makedirs(results_dir, exist_ok=True)
        # Generate filename with timestamp
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        results_filename = f"coco_eval_results_{timestamp}.json"
        results_path = os.path.join(results_dir, results_filename)
        # Prepare results data.
        # BUGFIX: COCOeval stats[6:9] are AR at maxDets=1/10/100, not AR at
        # IoU=0.50/0.75 — the labels below were corrected accordingly.
        results_data = {
            'timestamp': timestamp,
            'gt_json_path': gt_json_path,
            'pred_json': pred_json,
            'target_categories': target_categories,
            'filter_by_categories': filter_by_categories,
            'evaluation_results': eval_results,
            'stats_description': {
                'AP': 'Average Precision at IoU=0.50:0.95',
                'AP50': 'Average Precision at IoU=0.50',
                'AP75': 'Average Precision at IoU=0.75',
                'APs': 'Average Precision for small objects',
                'APm': 'Average Precision for medium objects',
                'APl': 'Average Precision for large objects',
                'AR1': 'Average Recall with at most 1 detection per image',
                'AR10': 'Average Recall with at most 10 detections per image',
                'AR100': 'Average Recall with at most 100 detections per image',
                'ARs': 'Average Recall for small objects',
                'ARm': 'Average Recall for medium objects',
                'ARl': 'Average Recall for large objects'
            }
        }
        # Save to JSON file
        with open(results_path, 'w') as f:
            json.dump(results_data, f, indent=2)
        print(f"\n====== Evaluation Results Saved ======")
        print(f"Results saved to: {results_path}")
        # Also save a summary file
        summary_filename = f"coco_eval_summary_{timestamp}.json"
        summary_path = os.path.join(results_dir, summary_filename)
        summary_data = {
            'timestamp': timestamp,
            'gt_json_path': gt_json_path,
            'pred_json': pred_json,
            'target_categories': target_categories,
            'summary_stats': {}
        }
        for eval_type, type_results in eval_results.items():
            stats = type_results['stats']
            summary_data['summary_stats'][eval_type] = {
                'AP': float(stats[0]),
                'AP50': float(stats[1]),
                'AP75': float(stats[2]),
                'APs': float(stats[3]),
                'APm': float(stats[4]),
                'APl': float(stats[5]),
                'AR1': float(stats[6]),
                'AR10': float(stats[7]),
                'AR100': float(stats[8]),
                'ARs': float(stats[9]),
                'ARm': float(stats[10]),
                'ARl': float(stats[11])
            }
        with open(summary_path, 'w') as f:
            json.dump(summary_data, f, indent=2)
        print(f"Summary saved to: {summary_path}")
    return eval_results