VSSM Encoder

Visual State Space Model encoder backbone for RS3Mamba.

models.encoders.vssm_encoder

Visual State Space Model (VSSM) Encoder for RS3Mamba.

This module contains the core Mamba components for 2D vision tasks. The implementation follows VMamba/SwinUMamba but uses mamba-ssm primitives.

Original source: https://github.com/sstary/SSRS/tree/main/RS3Mamba
Paper: RS3Mamba: Visual State Space Model for Remote Sensing Image Semantic Segmentation

PatchEmbed2D(patch_size: int = 4, in_chans: int = 3, embed_dim: int = 96, norm_layer: type[nn.Module] | None = None, **kwargs)

Bases: Module

Image to Patch Embedding.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| patch_size | int | Patch token size. | 4 |
| in_chans | int | Number of input image channels. | 3 |
| embed_dim | int | Number of linear projection output channels. | 96 |
| norm_layer | type[Module] \| None | Normalization layer. | None |

Source code in src/models/encoders/vssm_encoder.py, lines 35-50:

def __init__(
    self,
    patch_size: int = 4,
    in_chans: int = 3,
    embed_dim: int = 96,
    norm_layer: type[nn.Module] | None = None,
    **kwargs,
) -> None:
    super().__init__()
    if isinstance(patch_size, int):
        patch_size = (patch_size, patch_size)
    self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
    if norm_layer is not None:
        self.norm = norm_layer(embed_dim)
    else:
        self.norm = None
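For intuition, the patch projection above is a strided convolution: a patch_size x patch_size kernel with matching stride maps each non-overlapping patch to one embed_dim-dimensional token. A minimal, self-contained sketch with the default settings (a standalone Conv2d, not the class itself; the forward pass is not reproduced on this page):

import torch
import torch.nn as nn

# Same projection as self.proj above with the default arguments.
proj = nn.Conv2d(in_channels=3, out_channels=96, kernel_size=4, stride=4)

x = torch.randn(1, 3, 224, 224)   # (B, C, H, W)
tokens = proj(x)                  # (1, 96, 56, 56): spatial size divided by patch_size
print(tokens.shape)
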

PatchMerging2D(dim: int, norm_layer: type[nn.Module] = nn.LayerNorm)

Bases: Module

Patch Merging Layer for downsampling.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| dim | int | Number of input channels. | required |
| norm_layer | type[Module] | Normalization layer. | nn.LayerNorm |

Source code in src/models/encoders/vssm_encoder.py, lines 68-72:

def __init__(self, dim: int, norm_layer: type[nn.Module] = nn.LayerNorm) -> None:
    super().__init__()
    self.dim = dim
    self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
    self.norm = norm_layer(4 * dim)
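The layer halves the spatial resolution and doubles the channel count: each 2x2 neighbourhood of dim-channel tokens is concatenated to 4 * dim channels, normalised, then linearly reduced to 2 * dim. The gather below is a hedged sketch of the standard VMamba/Swin merge pattern implied by the __init__ above; the actual forward method is not shown on this page.

import torch
import torch.nn as nn

dim = 96
norm = nn.LayerNorm(4 * dim)                          # matches self.norm above
reduction = nn.Linear(4 * dim, 2 * dim, bias=False)   # matches self.reduction above

x = torch.randn(1, 56, 56, dim)                       # (B, H, W, C), channels-last
x0 = x[:, 0::2, 0::2, :]                              # one corner of each 2x2 block
x1 = x[:, 1::2, 0::2, :]
x2 = x[:, 0::2, 1::2, :]
x3 = x[:, 1::2, 1::2, :]
merged = torch.cat([x0, x1, x2, x3], dim=-1)          # (1, 28, 28, 4 * dim)
out = reduction(norm(merged))                         # (1, 28, 28, 2 * dim)
print(out.shape)
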

SS2D(d_model: int, d_state: int = 16, d_conv: int = 3, expand: int = 2, dt_rank: str | int = 'auto', dt_min: float = 0.001, dt_max: float = 0.1, dt_init: str = 'random', dt_scale: float = 1.0, dt_init_floor: float = 0.0001, dropout: float = 0.0, conv_bias: bool = True, bias: bool = False, device: torch.device | None = None, dtype: torch.dtype | None = None, **kwargs)

Bases: Module

Selective Scan 2D - Core Mamba operation for 2D images.

Scans the feature map along four directions (row-major, column-major, and their reverses) to capture long-range dependencies in 2D feature maps.

Source code in src/models/encoders/vssm_encoder.py, lines 110-240:

def __init__(
    self,
    d_model: int,
    d_state: int = 16,
    d_conv: int = 3,
    expand: int = 2,
    dt_rank: str | int = "auto",
    dt_min: float = 0.001,
    dt_max: float = 0.1,
    dt_init: str = "random",
    dt_scale: float = 1.0,
    dt_init_floor: float = 1e-4,
    dropout: float = 0.0,
    conv_bias: bool = True,
    bias: bool = False,
    device: torch.device | None = None,
    dtype: torch.dtype | None = None,
    **kwargs,
) -> None:
    factory_kwargs = {"device": device, "dtype": dtype}
    super().__init__()
    self.d_model = d_model
    self.d_state = d_state
    self.d_conv = d_conv
    self.expand = expand
    self.d_inner = int(self.expand * self.d_model)
    self.dt_rank = math.ceil(self.d_model / 16) if dt_rank == "auto" else dt_rank

    self.in_proj = nn.Linear(self.d_model, self.d_inner * 2, bias=bias, **factory_kwargs)
    self.conv2d = nn.Conv2d(
        in_channels=self.d_inner,
        out_channels=self.d_inner,
        groups=self.d_inner,
        bias=conv_bias,
        kernel_size=d_conv,
        padding=(d_conv - 1) // 2,
        **factory_kwargs,
    )
    self.act = nn.SiLU()

    # x_proj for 4 directions
    self.x_proj = (
        nn.Linear(
            self.d_inner,
            (self.dt_rank + self.d_state * 2),
            bias=False,
            **factory_kwargs,
        ),
        nn.Linear(
            self.d_inner,
            (self.dt_rank + self.d_state * 2),
            bias=False,
            **factory_kwargs,
        ),
        nn.Linear(
            self.d_inner,
            (self.dt_rank + self.d_state * 2),
            bias=False,
            **factory_kwargs,
        ),
        nn.Linear(
            self.d_inner,
            (self.dt_rank + self.d_state * 2),
            bias=False,
            **factory_kwargs,
        ),
    )
    self.x_proj_weight = nn.Parameter(
        torch.stack([t.weight for t in self.x_proj], dim=0),
    )  # (K=4, N, inner)
    del self.x_proj

    # dt_proj for 4 directions
    self.dt_projs = (
        self.dt_init(
            self.dt_rank,
            self.d_inner,
            dt_scale,
            dt_init,
            dt_min,
            dt_max,
            dt_init_floor,
            **factory_kwargs,
        ),
        self.dt_init(
            self.dt_rank,
            self.d_inner,
            dt_scale,
            dt_init,
            dt_min,
            dt_max,
            dt_init_floor,
            **factory_kwargs,
        ),
        self.dt_init(
            self.dt_rank,
            self.d_inner,
            dt_scale,
            dt_init,
            dt_min,
            dt_max,
            dt_init_floor,
            **factory_kwargs,
        ),
        self.dt_init(
            self.dt_rank,
            self.d_inner,
            dt_scale,
            dt_init,
            dt_min,
            dt_max,
            dt_init_floor,
            **factory_kwargs,
        ),
    )
    self.dt_projs_weight = nn.Parameter(
        torch.stack([t.weight for t in self.dt_projs], dim=0),
    )  # (K=4, inner, rank)
    self.dt_projs_bias = nn.Parameter(
        torch.stack([t.bias for t in self.dt_projs], dim=0),
    )  # (K=4, inner)
    del self.dt_projs

    self.A_logs = self.A_log_init(self.d_state, self.d_inner, copies=4, merge=True)
    self.Ds = self.D_init(self.d_inner, copies=4, merge=True)

    self.selective_scan = selective_scan_fn

    self.out_norm = nn.LayerNorm(self.d_inner)
    self.out_proj = nn.Linear(self.d_inner, self.d_model, bias=bias, **factory_kwargs)
    self.dropout = nn.Dropout(dropout) if dropout > 0.0 else None
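The four parameter stacks above (x_proj_weight, dt_projs_weight, A_logs, Ds with K=4) correspond to four scan orders of the 2D feature map: row-major, column-major, and their reversals. Below is a hedged sketch of how such scan sequences are typically assembled; the actual forward pass of SS2D is not reproduced on this page.

import torch

B, C, H, W = 1, 4, 3, 3
x = torch.randn(B, C, H, W)

x_hw = x.view(B, C, H * W)                                # row-major scan
x_wh = x.transpose(2, 3).contiguous().view(B, C, H * W)   # column-major scan
xs = torch.stack([x_hw, x_wh], dim=1)                     # (B, 2, C, L)
xs = torch.cat([xs, xs.flip(dims=[-1])], dim=1)           # add reversed orders
print(xs.shape)                                           # (B, 4, C, L): K = 4 directions
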

VSSBlock(hidden_dim: int = 0, drop_path: float = 0, norm_layer: Callable[..., nn.Module] = partial(nn.LayerNorm, eps=1e-06), attn_drop_rate: float = 0, d_state: int = 16, **kwargs)

Bases: Module

Visual State Space Block.

Source code in src/models/encoders/vssm_encoder.py, lines 391-408:

def __init__(
    self,
    hidden_dim: int = 0,
    drop_path: float = 0,
    norm_layer: Callable[..., nn.Module] = partial(nn.LayerNorm, eps=1e-6),
    attn_drop_rate: float = 0,
    d_state: int = 16,
    **kwargs,
) -> None:
    super().__init__()
    self.ln_1 = norm_layer(hidden_dim)
    self.self_attention = SS2D(
        d_model=hidden_dim,
        dropout=attn_drop_rate,
        d_state=d_state,
        **kwargs,
    )
    self.drop_path = DropPath(drop_path)
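Given the attributes built above (ln_1, self_attention, drop_path), the block most likely applies a pre-norm residual update. The forward method is not shown on this page, so treat the sketch below as an assumption, not the definitive implementation.

import torch
import torch.nn as nn

def vss_block_forward(block: nn.Module, x: torch.Tensor) -> torch.Tensor:
    # Assumed update: x + DropPath(SS2D(LayerNorm(x))), with x in channels-last (B, H, W, C) layout.
    return x + block.drop_path(block.self_attention(block.ln_1(x)))
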

VSSLayer(dim: int, depth: int, attn_drop: float = 0.0, drop_path: float | list[float] = 0.0, norm_layer: type[nn.Module] = nn.LayerNorm, downsample: type[nn.Module] | None = None, use_checkpoint: bool = False, d_state: int = 16, **kwargs)

Bases: Module

A layer containing multiple VSSBlocks.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| dim | int | Number of input channels. | required |
| depth | int | Number of blocks. | required |
| attn_drop | float | Attention dropout rate. | 0.0 |
| drop_path | float \| list[float] | Stochastic depth rate. | 0.0 |
| norm_layer | type[Module] | Normalization layer. | nn.LayerNorm |
| downsample | type[Module] \| None | Downsample layer at the end. | None |
| use_checkpoint | bool | Whether to use checkpointing. | False |
| d_state | int | State dimension for Mamba. | 16 |

Source code in src/models/encoders/vssm_encoder.py, lines 430-473:

def __init__(
    self,
    dim: int,
    depth: int,
    attn_drop: float = 0.0,
    drop_path: float | list[float] = 0.0,
    norm_layer: type[nn.Module] = nn.LayerNorm,
    downsample: type[nn.Module] | None = None,
    use_checkpoint: bool = False,
    d_state: int = 16,
    **kwargs,
) -> None:
    super().__init__()
    self.dim = dim
    self.use_checkpoint = use_checkpoint

    self.blocks = nn.ModuleList(
        [
            VSSBlock(
                hidden_dim=dim,
                drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
                norm_layer=norm_layer,
                attn_drop_rate=attn_drop,
                d_state=d_state,
            )
            for i in range(depth)
        ],
    )

    def _init_weights(module: nn.Module) -> None:
        if isinstance(module, nn.Linear):
            trunc_normal_(module.weight, std=0.02)
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)
        elif isinstance(module, nn.LayerNorm):
            nn.init.constant_(module.bias, 0)
            nn.init.constant_(module.weight, 1.0)

    self.apply(_init_weights)

    if downsample is not None:
        self.downsample = downsample(dim=dim, norm_layer=norm_layer)
    else:
        self.downsample = None
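Note that drop_path may be a single float or a per-block list: each VSSBlock receives drop_path[i] when a list is passed. VSSMEncoder (documented below) builds a linearly increasing stochastic-depth schedule over all blocks and slices one sub-list per stage. The arithmetic, using only code shown on this page:

import torch

depths = [2, 2, 9, 2]
drop_path_rate = 0.2
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]

# Per-stage slices, exactly as VSSMEncoder passes them to each VSSLayer.
per_stage = [dpr[sum(depths[:i]) : sum(depths[: i + 1])] for i in range(len(depths))]
for i, rates in enumerate(per_stage):
    print(f"stage {i}: {[round(r, 3) for r in rates]}")
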

VSSMEncoder(patch_size: int = 4, in_chans: int = 3, depths: list[int] | None = None, dims: list[int] | None = None, d_state: int = 16, drop_rate: float = 0.0, attn_drop_rate: float = 0.0, drop_path_rate: float = 0.2, norm_layer: type[nn.Module] = nn.LayerNorm, patch_norm: bool = True, use_checkpoint: bool = False, **kwargs)

Bases: Module

Visual State Space Model Encoder.

Hierarchical encoder based on VMamba architecture.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| patch_size | int | Patch embedding size. | 4 |
| in_chans | int | Number of input channels. | 3 |
| depths | list[int] \| None | Depth of each stage; None resolves to [2, 2, 9, 2]. | None |
| dims | list[int] \| None | Dimensions at each stage; None resolves to [96, 192, 384, 768]. | None |
| d_state | int | State dimension for Mamba. | 16 |
| drop_rate | float | Dropout rate. | 0.0 |
| attn_drop_rate | float | Attention dropout rate. | 0.0 |
| drop_path_rate | float | Stochastic depth rate. | 0.2 |
| norm_layer | type[Module] | Normalization layer. | nn.LayerNorm |
| patch_norm | bool | Whether to apply norm after patch embedding. | True |
| use_checkpoint | bool | Whether to use checkpointing. | False |

Source code in src/models/encoders/vssm_encoder.py, lines 508-572:

def __init__(
    self,
    patch_size: int = 4,
    in_chans: int = 3,
    depths: list[int] | None = None,
    dims: list[int] | None = None,
    d_state: int = 16,
    drop_rate: float = 0.0,
    attn_drop_rate: float = 0.0,
    drop_path_rate: float = 0.2,
    norm_layer: type[nn.Module] = nn.LayerNorm,
    patch_norm: bool = True,
    use_checkpoint: bool = False,
    **kwargs,
) -> None:
    super().__init__()

    if depths is None:
        depths = [2, 2, 9, 2]
    if dims is None:
        dims = [96, 192, 384, 768]

    self.num_layers = len(depths)
    if isinstance(dims, int):
        dims = [int(dims * 2**i_layer) for i_layer in range(self.num_layers)]
    self.embed_dim = dims[0]
    self.num_features = dims[-1]
    self.dims = dims

    self.patch_embed = PatchEmbed2D(
        patch_size=patch_size,
        in_chans=in_chans,
        embed_dim=self.embed_dim,
        norm_layer=norm_layer if patch_norm else None,
    )

    self.ape = False
    if self.ape:
        self.patches_resolution = self.patch_embed.patches_resolution
        self.absolute_pos_embed = nn.Parameter(
            torch.zeros(1, *self.patches_resolution, self.embed_dim),
        )
        trunc_normal_(self.absolute_pos_embed, std=0.02)
    self.pos_drop = nn.Dropout(p=drop_rate)

    dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]

    self.layers = nn.ModuleList()
    self.downsamples = nn.ModuleList()
    for i_layer in range(self.num_layers):
        layer = VSSLayer(
            dim=dims[i_layer],
            depth=depths[i_layer],
            d_state=d_state,
            drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])],
            norm_layer=norm_layer,
            attn_drop=attn_drop_rate,
            use_checkpoint=use_checkpoint,
        )
        self.layers.append(layer)

        if i_layer < self.num_layers - 1:
            self.downsamples.append(PatchMerging2D(dim=dims[i_layer], norm_layer=norm_layer))

    self.apply(self._init_weights)
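A hedged usage sketch with the VMamba-Tiny defaults, assuming the package is importable as models.encoders.vssm_encoder (the forward pass and its output format are not shown on this page):

import torch
from models.encoders.vssm_encoder import VSSMEncoder

encoder = VSSMEncoder(
    patch_size=4,
    in_chans=3,
    depths=[2, 2, 9, 2],
    dims=[96, 192, 384, 768],
    d_state=16,
    drop_path_rate=0.2,
)

x = torch.randn(1, 3, 512, 512)  # illustrative remote-sensing tile size
# With patch_size=4 and three PatchMerging2D downsamples, the four stages
# operate at 128, 64, 32, and 16 pixels per side with 96, 192, 384, 768 channels.
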

load_vssm_pretrained_ckpt(model: nn.Module, ckpt_path: str = './pretrain/vmamba_tiny_e292.pth') -> nn.Module

Load pretrained VMamba weights into VSSMEncoder.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| model | Module | Model containing a vssm_encoder attribute. | required |
| ckpt_path | str | Path to pretrained weights. | './pretrain/vmamba_tiny_e292.pth' |


Returns:

| Type | Description |
| --- | --- |
| Module | Model with loaded weights. |

Source code in src/models/encoders/vssm_encoder.py, lines 609-654:

def load_vssm_pretrained_ckpt(
    model: nn.Module,
    ckpt_path: str = "./pretrain/vmamba_tiny_e292.pth",
) -> nn.Module:
    """Load pretrained VMamba weights into VSSMEncoder.

    Args:
        model: Model containing vssm_encoder attribute
        ckpt_path: Path to pretrained weights

    Returns:
        Model with loaded weights

    """
    print(f"Loading VSSM weights from: {ckpt_path}")
    skip_params = [
        "norm.weight",
        "norm.bias",
        "head.weight",
        "head.bias",
        "patch_embed.proj.weight",
        "patch_embed.proj.bias",
        "patch_embed.norm.weight",
        "patch_embed.norm.weight",
    ]

    ckpt = torch.load(ckpt_path, map_location="cpu")
    model_dict = model.state_dict()

    for k, v in ckpt["model"].items():
        if k in skip_params:
            print(f"Skipping weights: {k}")
            continue
        kr = f"vssm_encoder.{k}"
        if "downsample" in kr:
            i_ds = int(re.findall(r"layers\.(\d+)\.downsample", kr)[0])
            kr = kr.replace(f"layers.{i_ds}.downsample", f"downsamples.{i_ds}")
            assert kr in model_dict.keys()
        if kr in model_dict:
            if model_dict[kr].shape == v.shape:
                model_dict[kr] = v
            else:
                print(f"Shape mismatch for {kr}: {model_dict[kr].shape} vs {v.shape}")

    model.load_state_dict(model_dict, strict=False)
    return model
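
Usage sketch. Because every checkpoint key is remapped with the vssm_encoder. prefix, the wrapper model must expose the encoder under that attribute name. RS3Mamba below is a hypothetical wrapper used only for illustration; it is not defined on this page.

model = RS3Mamba(num_classes=6)  # hypothetical wrapper exposing a .vssm_encoder attribute
model = load_vssm_pretrained_ckpt(model, ckpt_path="./pretrain/vmamba_tiny_e292.pth")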