Skip to content

Multimodal Pipeline

Training and evaluation pipeline for multimodal late fusion of aerial and Sentinel-2 models.

pipeline.multimodal_pipeline

Training and evaluation pipeline for multimodal fusion experiments.

This pipeline handles training with both aerial and Sentinel-2 data for the MultimodalLateFusion model architecture.

MultimodalTrainEvalPipeline(run_name: str | None = None, logs_dir: str | None = None)

Pipeline for multimodal fusion training and evaluation.

Initialize the multimodal pipeline.

Source code in src/pipeline/multimodal_pipeline.py
448
449
450
451
452
453
454
def __init__(self, run_name: str | None = None, logs_dir: str | None = None) -> None:
    """Initialize the multimodal pipeline."""
    timestamp = datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S")
    safe_name = re.sub(r"[^A-Za-z0-9._-]+", "_", run_name).strip("_") if run_name else ""
    run_suffix = f"_{safe_name}" if safe_name else ""
    logs_path = Path(logs_dir).expanduser().resolve() if logs_dir else Path.cwd()
    self.log_file = logs_path / f"multimodal_pipeline_{timestamp}{run_suffix}.log"

run(config: dict[str, Any], *, no_stdout_logs: bool = False) -> None

Execute the multimodal training and evaluation pipeline.

Source code in src/pipeline/multimodal_pipeline.py
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
def run(self, config: dict[str, Any], *, no_stdout_logs: bool = False) -> None:
    """Execute the multimodal training and evaluation pipeline.

    Initialises MLflow tracking and logging, seeds the run, and then, inside
    a single MLflow run, builds the train/val/test datasets and loaders,
    builds the model, loss, optimizer and LR scheduler, trains with
    ``train_multimodal``, evaluates on the test loader and optionally logs a
    prediction mosaic.

    Args:
        config: Experiment configuration. The ``mlflow``, ``model``,
            ``training`` and ``data`` sections are read directly; the
            ``experiment``, ``evaluation`` and ``validation`` sections are
            optional.
        no_stdout_logs: Forwarded unchanged to ``setup_logging``.

    """
    tracking_cfg = config["mlflow"]
    init_mlflow(
        tracking_uri=tracking_cfg.get("tracking_uri"),
        experiment_name=tracking_cfg["name"],
        dagshub_config=tracking_cfg.get("dagshub"),
    )
    setup_logging(
        log_file=self.log_file,
        log_formatter=LOG_FORMATTER,
        no_stdout_logs=no_stdout_logs,
    )

    experiment_cfg = config.get("experiment", {})
    seed = int(experiment_cfg.get("seed", 42))
    deterministic = bool(experiment_cfg.get("deterministic", True))
    seed_everything(seed=seed, deterministic=deterministic)

    logger.info("Starting multimodal fusion train-evaluation pipeline.")

    with mlflow.start_run(run_name=config["mlflow"]["run_name"]):
        mlflow.log_dict(config, artifact_file="config_resolved.json")

        # --- Run parameters and tags -------------------------------------
        model_cfg = config["model"]
        mlflow.log_params(
            {
                "seed": seed,
                "deterministic": deterministic,
                "model_type": model_cfg["model_type"],
                "fusion_mode": model_cfg.get("fusion_mode", "weighted"),
                "freeze_encoders": model_cfg.get("freeze_encoders", True),
                "use_cloud_uncertainty": model_cfg.get("use_cloud_uncertainty", False),
            }
        )

        train_cfg = config["training"]
        mlflow.log_params(
            {
                "optimizer": train_cfg["optimizer"]["type"],
                "learning_rate": train_cfg["optimizer"]["learning_rate"],
                "loss_function": train_cfg["loss_function"]["type"],
                "epochs": train_cfg["epochs"],
                "batch_size": config["data"]["batch_size"],
            }
        )

        note = tracking_cfg.get("note")
        if note:
            mlflow.set_tag("note", note)
        mlflow.set_tag("data_type", "multimodal_fusion")

        # --- Datasets ----------------------------------------------------
        data_cfg = config["data"]

        # Normalization is applied unless the section is absent or disabled.
        image_transform = None
        norm_cfg = data_cfg.get("normalization")
        if norm_cfg is not None and norm_cfg.get("enabled", True):
            elevation_range = norm_cfg.get("elevation_range")
            image_transform = MultiChannelNormalize(
                mean=norm_cfg["mean"],
                std=norm_cfg["std"],
                scale_to_unit=norm_cfg.get("scale_to_unit"),
                elevation_range=None if elevation_range is None else tuple(elevation_range),
                elevation_channel_index=norm_cfg.get("elevation_channel_index"),
            )

        selected_channels = data_cfg.get("selected_channels")

        def _dataset_for(split: str) -> FlairMultimodalDataset:
            """Build the dataset of one split; only the three paths differ per split."""
            return FlairMultimodalDataset(
                image_dir=data_cfg[split]["images"],
                mask_dir=data_cfg[split]["masks"],
                sentinel_dir=data_cfg[split]["sentinel"],
                centroids_path=data_cfg["centroids_path"],
                num_classes=data_cfg["num_classes"],
                image_transform=image_transform,
                selected_channels=selected_channels,
                sentinel_patch_size=data_cfg.get("sentinel_patch_size", 10),
                context_size=data_cfg.get("context_size"),
                use_monthly_average=data_cfg.get("use_monthly_average", True),
                cloud_snow_cover_threshold=data_cfg.get("cloud_snow_cover_threshold", 0.6),
                cloud_snow_prob_threshold=data_cfg.get("cloud_snow_prob_threshold", 50),
                sentinel_scale_factor=data_cfg.get("sentinel_scale_factor", 10000.0),
                sentinel_mean=data_cfg.get("sentinel_mean"),
                sentinel_std=data_cfg.get("sentinel_std"),
            )

        train_dataset = _dataset_for("train")
        val_dataset = _dataset_for("val")
        test_dataset = _dataset_for("test")

        # --- Data loaders ------------------------------------------------
        generator = create_generator(seed)
        num_workers = data_cfg.get("num_workers", 4)

        # Options common to all three loaders.
        loader_options = {
            "batch_size": data_cfg["batch_size"],
            "num_workers": num_workers,
            "worker_init_fn": seed_worker,
            "persistent_workers": bool(num_workers > 0),
            "collate_fn": multimodal_collate_fn,
        }
        # Only the training loader shuffles and receives the seeded generator.
        train_loader = DataLoader(
            train_dataset,
            shuffle=True,
            generator=generator,
            **loader_options,
        )
        val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)
        test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

        # --- Model, loss and device --------------------------------------
        # CUDA is used only when it is both available and requested.
        use_cuda = torch.cuda.is_available() and config["training"]["device"] == "cuda"
        device = torch.device("cuda:0" if use_cuda else "cpu")
        device_label = torch.cuda.get_device_name(0) if device.type == "cuda" else "CPU"
        logger.info("Using device: %s", device_label)

        model = build_model(
            model_type=model_cfg["model_type"],
            encoder_name=model_cfg.get("encoder_name", ""),
            encoder_weights=model_cfg.get("encoder_weights"),
            in_channels=model_cfg.get("aerial_in_channels", 5),
            n_classes=data_cfg["num_classes"],
            model_config=model_cfg,
        )

        loss_cfg = train_cfg["loss_function"]
        criterion = build_loss_function(
            loss_type=loss_cfg["type"],
            kwargs=loss_cfg.get("args", {}),
        )

        model.to(device)
        criterion.to(device)

        # --- Model complexity (best effort) ------------------------------
        # Any failure here is logged as a warning and does not stop the run.
        try:
            aerial_hw = tuple(model_cfg.get("aerial_resolution", [512, 512]))
            aerial_shape = (1, model_cfg.get("aerial_in_channels", 5), *aerial_hw)

            sentinel_model_cfg = model_cfg.get("sentinel_model_config", {})
            sentinel_side = sentinel_model_cfg.get(
                "image_size",
                data_cfg.get("sentinel_patch_size", 10),
            )
            seq_len = sentinel_model_cfg.get("max_seq_len")
            seq_len = 12 if seq_len is None else seq_len
            sentinel_channels = model_cfg.get("sentinel_in_channels")
            sentinel_channels = 10 if sentinel_channels is None else sentinel_channels
            sentinel_shape = (
                1,
                int(seq_len),
                int(sentinel_channels),
                int(sentinel_side),
                int(sentinel_side),
            )

            complexity_metrics = compute_model_complexity(
                model,
                aerial_shape,
                sentinel_input_size=sentinel_shape,
            )
            mlflow.log_metrics(complexity_metrics)
            logger.info("Model Complexity: %s", complexity_metrics)
        except Exception as e:  # noqa: BLE001
            logger.warning("Failed to compute model complexity: %s", e)

        # --- Optimizer and LR scheduler ----------------------------------
        optimizer_cfg = train_cfg["optimizer"]
        optimizer = build_optimizer(
            model=model,
            optimizer_type=optimizer_cfg["type"],
            learning_rate=optimizer_cfg["learning_rate"],
            weight_decay=optimizer_cfg.get("weight_decay", 0.0001),
            betas=optimizer_cfg.get("betas", [0.9, 0.999]),
        )

        accumulation_steps = train_cfg.get("accumulation_steps", 1)
        # Ceiling division: number of optimizer updates per epoch.
        batches_per_epoch = len(train_loader)
        optimizer_steps_per_epoch = (
            batches_per_epoch + accumulation_steps - 1
        ) // accumulation_steps

        lr_scheduler = build_lr_scheduler(
            optimizer=optimizer,
            scheduler_config=train_cfg.get("lr_scheduler"),
            steps_per_epoch=optimizer_steps_per_epoch,
            epochs=train_cfg["epochs"],
        )

        # --- Training ----------------------------------------------------
        logger.info("Starting multimodal fusion training")
        logger.info("Fusion mode: %s", model_cfg.get("fusion_mode", "weighted"))
        logger.info("Freeze encoders: %s", model_cfg.get("freeze_encoders", True))

        train_multimodal(
            model=model,
            train_loader=train_loader,
            val_loader=val_loader,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=lr_scheduler,
            device=device,
            epochs=train_cfg["epochs"],
            patience=train_cfg["early_stopping_patience"],
            num_classes=data_cfg["num_classes"],
            other_class_index=data_cfg.get("other_class_index"),
            accumulation_steps=accumulation_steps,
            early_stopping_criterion=train_cfg.get("early_stopping_criterion", "miou"),
            use_amp=train_cfg.get("use_amp", False),
        )

        logger.info("Training finished. Evaluating on test set...")

        # --- Test evaluation ---------------------------------------------
        eval_metrics = _evaluate_multimodal_model(
            model=model,
            loader=test_loader,
            device=device,
            num_classes=data_cfg["num_classes"],
            ignore_index=data_cfg.get("other_class_index"),
            log_eval_metrics=True,
            log_confusion_matrix=True,
            class_name_mapping=None,
        )

        # Echo per-class IoU/F1 entries to the log for visibility.
        logger.info("Per-Class Metrics:")
        per_class_markers = ("iou_class_", "f1_class_")
        for metric_name, metric_value in eval_metrics.items():
            if any(marker in metric_name for marker in per_class_markers):
                logger.info("  %s: %.4f", metric_name, metric_value)

        if "total_inference_time" in eval_metrics:
            logger.info(
                "Timing: total_inference_time=%.3fs, avg_time_per_image=%.6fs, avg_time_per_batch=%.6fs",
                eval_metrics.get("total_inference_time", 0.0),
                eval_metrics.get("avg_time_per_image", 0.0),
                eval_metrics.get("avg_time_per_batch", 0.0),
            )

        # --- Optional prediction mosaic ----------------------------------
        # The "evaluation" section takes precedence; "validation" is the fallback.
        mosaic_cfg = config.get("evaluation", {}).get("mosaic")
        if not mosaic_cfg:
            mosaic_cfg = config.get("validation", {}).get("mosaic")
        if mosaic_cfg and mosaic_cfg.get("enabled", False):
            logger.info("Generating prediction mosaic...")
            log_prediction_mosaic_to_mlflow(
                model=model,
                data_loader=test_loader,
                device=device,
                num_classes=data_cfg["num_classes"],
                zone_name=mosaic_cfg.get("zone_name", "test_zone"),
                grid_size=mosaic_cfg.get("grid_size", 10),
                patch_size=mosaic_cfg.get("patch_size", 512),
            )

        logger.info("Multimodal pipeline completed successfully.")

add_train_eval_arguments(parser: argparse.ArgumentParser) -> argparse.ArgumentParser

Add arguments for the multimodal pipeline.

Source code in src/pipeline/multimodal_pipeline.py
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
def add_train_eval_arguments(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
    """Register the multimodal pipeline's command-line options on ``parser``.

    Adds ``-c/--config`` (required), ``-l/--logs-dir`` (optional, defaults to
    ``None``) and the ``-q/--no-stdout-logs`` flag, in that order.

    Args:
        parser: Parser to extend in place.

    Returns:
        The same ``parser`` instance, to allow chaining.

    """
    # (flags, keyword options) pairs, registered in declaration order.
    option_table = (
        (
            ("-c", "--config"),
            {
                "type": str,
                "required": True,
                "help": "Path to multimodal pipeline configuration file.",
            },
        ),
        (
            ("-l", "--logs-dir"),
            {
                "type": str,
                "default": None,
                "help": "Directory to write pipeline logs.",
            },
        ),
        (
            ("-q", "--no-stdout-logs"),
            {
                "required": False,
                "action": "store_true",
                "help": "Suppress logging output in the terminal.",
            },
        ),
    )
    for flags, options in option_table:
        parser.add_argument(*flags, **options)
    return parser

main() -> None

Run the multimodal training/evaluation CLI entrypoint.

Source code in src/pipeline/multimodal_pipeline.py
807
808
809
810
811
812
def main() -> None:
    """Parse the command line and launch the multimodal train/eval run."""
    cli_parser = add_train_eval_arguments(
        argparse.ArgumentParser(description="Multimodal fusion training pipeline"),
    )
    run_train_eval(cli_parser.parse_args())

run_train_eval(args: argparse.Namespace) -> None

Run the multimodal pipeline with the provided configuration.

Source code in src/pipeline/multimodal_pipeline.py
792
793
794
795
796
797
798
799
800
801
802
803
804
def run_train_eval(args: argparse.Namespace) -> None:
    """Load the YAML configuration named by ``args`` and run the pipeline.

    Args:
        args: Parsed CLI arguments; ``config``, ``logs_dir`` and
            ``no_stdout_logs`` are read.

    Raises:
        ValueError: If ``args.config`` does not point to an existing file.

    """
    config_path = Path(args.config)
    if not config_path.is_file():
        msg = f"config path {config_path} does not exist"
        raise ValueError(msg)

    settings = read_yaml(config_path)

    # The MLflow run name doubles as the label embedded in the log file name.
    MultimodalTrainEvalPipeline(
        run_name=settings["mlflow"]["run_name"],
        logs_dir=args.logs_dir,
    ).run(settings, no_stdout_logs=args.no_stdout_logs)

train_multimodal(model: nn.Module, train_loader: DataLoader, val_loader: DataLoader, criterion: nn.Module, optimizer: torch.optim.Optimizer, device: torch.device, scheduler: LRScheduler | None = None, epochs: int = 100, patience: int = 20, num_classes: int = 13, other_class_index: int | None = None, accumulation_steps: int = 1, early_stopping_criterion: str = 'miou', *, use_amp: bool = False, log_evaluation_metrics: bool = True, log_model: bool = True, max_grad_norm: float | None = None) -> dict[str, Any]

Train a multimodal fusion model.

Parameters:

Name Type Description Default
model Module

MultimodalLateFusion model.

required
train_loader DataLoader

DataLoader for training data.

required
val_loader DataLoader

DataLoader for validation data.

required
criterion Module

Loss function.

required
optimizer Optimizer

Optimizer.

required
device device

Training device.

required
scheduler LRScheduler | None

Optional learning rate scheduler.

None
epochs int

Maximum epochs.

100
patience int

Early stopping patience.

20
num_classes int

Number of classes.

13
other_class_index int | None

Index of "other" class to ignore.

None
accumulation_steps int

Gradient accumulation steps.

1
early_stopping_criterion str

Validation metric that decides improvement for early stopping and best-checkpoint selection: 'loss' or 'miou'.

'miou'
use_amp bool

Whether to use AMP.

False
log_evaluation_metrics bool

Whether to log to MLflow.

True
log_model bool

Whether to log model to MLflow.

True
max_grad_norm float | None

Optional gradient clipping.

None

Returns:

Type Description
dict[str, Any]

Dictionary with training history and best metrics.

Source code in src/pipeline/multimodal_pipeline.py
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
def train_multimodal(
    model: nn.Module,
    train_loader: DataLoader,
    val_loader: DataLoader,
    criterion: nn.Module,
    optimizer: torch.optim.Optimizer,
    device: torch.device,
    scheduler: LRScheduler | None = None,
    epochs: int = 100,
    patience: int = 20,
    num_classes: int = 13,
    other_class_index: int | None = None,
    accumulation_steps: int = 1,
    early_stopping_criterion: str = "miou",
    *,
    use_amp: bool = False,
    log_evaluation_metrics: bool = True,
    log_model: bool = True,
    max_grad_norm: float | None = None,
) -> dict[str, Any]:
    """Train a multimodal fusion model.

    Args:
        model: MultimodalLateFusion model.
        train_loader: DataLoader for training data.
        val_loader: DataLoader for validation data.
        criterion: Loss function.
        optimizer: Optimizer.
        device: Training device.
        scheduler: Optional learning rate scheduler.
        epochs: Maximum epochs.
        patience: Early stopping patience.
        num_classes: Number of classes.
        other_class_index: Index of "other" class to ignore.
        accumulation_steps: Gradient accumulation steps.
        early_stopping_criterion: 'loss' or 'miou'.
        use_amp: Whether to use AMP.
        log_evaluation_metrics: Whether to log to MLflow.
        log_model: Whether to log model to MLflow.
        max_grad_norm: Optional gradient clipping.

    Returns:
        Dictionary with training history and best metrics.

    """
    if early_stopping_criterion not in ("loss", "miou"):
        msg = f"early_stopping_criterion must be 'loss' or 'miou', got {early_stopping_criterion!r}"
        raise ValueError(msg)

    model.to(device)

    best_val_loss = float("inf")
    best_val_miou = 0.0
    no_improve = 0
    losses_train: list[float] = []
    losses_val: list[float] = []
    mious_val: list[float] = []
    best_model_state = None

    is_step_scheduler = scheduler is not None and not isinstance(scheduler, ReduceLROnPlateau)
    step_scheduler = scheduler if is_step_scheduler else None

    for epoch in range(epochs):
        loss_epoch = _train_epoch_multimodal(
            model=model,
            loader=train_loader,
            criterion=criterion,
            optimizer=optimizer,
            device=device,
            accumulation_steps=accumulation_steps,
            use_amp=use_amp,
            scheduler=step_scheduler,
            max_grad_norm=max_grad_norm,
        )
        losses_train.append(loss_epoch)
        logger.info("Epoch %d/%d: Training Loss: %.4f", epoch + 1, epochs, loss_epoch)

        val_loss, val_miou = _validate_epoch_multimodal(
            model=model,
            loader=val_loader,
            criterion=criterion,
            device=device,
            num_classes=num_classes,
            ignore_index=other_class_index,
        )
        losses_val.append(val_loss)
        mious_val.append(val_miou)
        logger.info(
            "Epoch %d/%d: Validation Loss: %.4f, Validation mIoU: %.4f",
            epoch + 1,
            epochs,
            val_loss,
            val_miou,
        )

        if log_evaluation_metrics:
            log_metrics_to_mlflow(
                metrics={"train_loss": loss_epoch, "val_loss": val_loss, "val_miou": val_miou},
                step=epoch,
            )

        if early_stopping_criterion == "miou":
            improved = val_miou > best_val_miou
        else:
            improved = val_loss < best_val_loss

        if improved:
            best_val_miou = val_miou
            best_val_loss = val_loss
            no_improve = 0
            if early_stopping_criterion == "miou":
                logger.info("Validation mIoU improved to %.4f", best_val_miou)
            else:
                logger.info("Validation loss improved to %.4f", best_val_loss)
            best_model_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
        else:
            no_improve += 1
            logger.info("No improvement for %d epochs.", no_improve)
            if no_improve >= patience:
                logger.info("Early stopping at epoch %d.", epoch + 1)
                break

        if scheduler is not None and isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(val_loss)

        if log_evaluation_metrics:
            current_lr = optimizer.param_groups[0]["lr"]
            mlflow.log_metric("learning_rate", current_lr, step=epoch)

    if best_model_state is not None:
        model.load_state_dict(best_model_state)
        model.to(device)

        # Log fusion weights if available
        if hasattr(model, "get_fusion_weights"):
            weights = model.get_fusion_weights()
            if weights:
                logger.info(
                    "Learned fusion weights (aerial per class): %s", weights.get("aerial_weights")
                )
                logger.info(
                    "Learned fusion weights (sentinel per class): %s",
                    weights.get("sentinel_weights"),
                )

                # Save fusion weights to MLflow as a JSON artifact
                weights_json = {
                    k: v.cpu().tolist() if isinstance(v, torch.Tensor) else v
                    for k, v in weights.items()
                }
                mlflow.log_dict(weights_json, "fusion_weights.json")

    return {
        "train_loss": losses_train,
        "val_loss": losses_val,
        "val_miou": mious_val,
        "best_val_loss": best_val_loss,
        "best_val_miou": best_val_miou,
    }