#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
#           This file was automatically generated from src/transformers/models/deimv2/modular_deimv2.py.
#               Do NOT edit this file manually as any edits will be overwritten by the generation of
#             the file from the modular. If any change should be done, please apply the change to the
#                          modular_deimv2.py file directly. One of our CI enforces this.
#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Copyright 2026 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from huggingface_hub.dataclasses import strict

from ...backbone_utils import consolidate_backbone_kwargs_to_config
from ...configuration_utils import PreTrainedConfig
from ...utils import auto_docstring
from ..auto import AutoConfig


# TODO: Attribute map assignment logic should be fixed in modular
# as well as super() call parsing because otherwise we cannot re-write args after initialization
@auto_docstring(checkpoint="Intellindust/DEIMv2_HGNetv2_N_COCO")
@strict
class Deimv2Config(PreTrainedConfig):
    r"""
    initializer_bias_prior_prob (`float`, *optional*):
        The prior probability used by the bias initializer to initialize biases for `enc_score_head` and `class_embed`.
        If `None`, `prior_prob` is computed as `prior_prob = 1 / (num_labels + 1)` while initializing model weights.
    freeze_backbone_batch_norms (`bool`, *optional*, defaults to `True`):
        Whether to freeze the batch normalization layers in the backbone.
    encoder_in_channels (`list`, *optional*, defaults to `[512, 1024, 2048]`):
        Multi-level feature inputs for the encoder.
    feat_strides (`list[int]`, *optional*, defaults to `[8, 16, 32]`):
        Strides used in each feature map.
    encode_proj_layers (`list[int]`, *optional*, defaults to `[2]`):
        Indexes of the projected layers to be used in the encoder.
    positional_encoding_temperature (`int`, *optional*, defaults to 10000):
        The temperature parameter used to create the positional encodings.
    encoder_activation_function (`str`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler.
    eval_size (`list[int]` or `tuple[int, int]`, *optional*):
        Height and width used to compute the effective height and width of the position embeddings after taking
        into account the stride.
    normalize_before (`bool`, *optional*, defaults to `False`):
        Determine whether to apply layer normalization in the transformer encoder layer before self-attention and
        feed-forward modules.
    hidden_expansion (`float`, *optional*, defaults to 1.0):
        Expansion ratio to enlarge the dimension size of RepVGGBlock and CSPRepLayer.
    num_queries (`int`, *optional*, defaults to 300):
        Number of object queries.
    decoder_in_channels (`list`, *optional*, defaults to `[256, 256, 256]`):
        Multi-level feature dimensions for the decoder.
    num_feature_levels (`int`, *optional*, defaults to 3):
        The number of input feature levels.
    decoder_n_points (`int`, *optional*, defaults to 4):
        The number of sampled keys in each feature level for each attention head in the decoder.
    decoder_activation_function (`str`, *optional*, defaults to `"relu"`):
        The non-linear activation function (function or string) in the decoder.
    num_denoising (`int`, *optional*, defaults to 100):
        The total number of denoising tasks or queries to be used for contrastive denoising.
    label_noise_ratio (`float`, *optional*, defaults to 0.5):
        The fraction of denoising labels to which random noise should be added.
    box_noise_scale (`float`, *optional*, defaults to 1.0):
        Scale or magnitude of noise to be added to the bounding boxes.
    learn_initial_query (`bool`, *optional*, defaults to `False`):
        Indicates whether the initial query embeddings for the decoder should be learned during training.
    anchor_image_size (`int` or `list[int]`, *optional*):
        Height and width of the input image used during evaluation to generate the bounding box anchors.
    with_box_refine (`bool`, *optional*, defaults to `True`):
        Whether to apply iterative bounding box refinement.
    matcher_alpha (`float`, *optional*, defaults to 0.25):
        Parameter alpha used by the Hungarian Matcher.
    matcher_gamma (`float`, *optional*, defaults to 2.0):
        Parameter gamma used by the Hungarian Matcher.
    matcher_class_cost (`float`, *optional*, defaults to 2.0):
        The relative weight of the class loss used by the Hungarian Matcher.
    matcher_bbox_cost (`float`, *optional*, defaults to 5.0):
        The relative weight of the bounding box loss used by the Hungarian Matcher.
    matcher_giou_cost (`float`, *optional*, defaults to 2.0):
        The relative weight of the giou loss used by the Hungarian Matcher.
    use_focal_loss (`bool`, *optional*, defaults to `True`):
        Parameter informing if focal loss should be used.
    focal_loss_alpha (`float`, *optional*, defaults to 0.75):
        Parameter alpha used to compute the focal loss.
    focal_loss_gamma (`float`, *optional*, defaults to 2.0):
        Parameter gamma used to compute the focal loss.
    weight_loss_vfl (`float`, *optional*, defaults to 1.0):
        Relative weight of the varifocal loss in the object detection loss.
    weight_loss_bbox (`float`, *optional*, defaults to 5.0):
        Relative weight of the L1 bounding box loss in the object detection loss.
    weight_loss_giou (`float`, *optional*, defaults to 2.0):
        Relative weight of the generalized IoU loss in the object detection loss.
    weight_loss_fgl (`float`, *optional*, defaults to 0.15):
        Relative weight of the fine-grained localization loss in the object detection loss.
    weight_loss_ddf (`float`, *optional*, defaults to 1.5):
        Relative weight of the decoupled distillation focal loss in the object detection loss.
    eval_idx (`int`, *optional*, defaults to -1):
        Index of the decoder layer to use for evaluation.
    layer_scale (`float`, *optional*, defaults to 1.0):
        Scaling factor for the hidden dimension in later decoder layers.
    max_num_bins (`int`, *optional*, defaults to 32):
        Maximum number of bins for the distribution-guided bounding box refinement.
    reg_scale (`float`, *optional*, defaults to 4.0):
        Scale factor for the regression distribution.
    depth_mult (`float`, *optional*, defaults to 1.0):
        Multiplier for the number of blocks in RepNCSPELAN5 layers.
    top_prob_values (`int`, *optional*, defaults to 4):
        Number of top probability values to consider from each corner's distribution.
    lqe_hidden_dim (`int`, *optional*, defaults to 64):
        Hidden dimension size for the Location Quality Estimator (LQE) network.
    lqe_layers (`int`, *optional*, defaults to 2):
        Number of layers in the Location Quality Estimator MLP.
    decoder_offset_scale (`float`, *optional*, defaults to 0.5):
        Offset scale used in deformable attention.
    decoder_method (`str`, *optional*, defaults to `"default"`):
        The method to use for the decoder: `"default"` or `"discrete"`.
    up (`float`, *optional*, defaults to 0.5):
        Controls the upper bounds of the Weighting Function.
    weight_loss_mal (`float`, *optional*, defaults to 1.0):
        Relative weight of the matching auxiliary loss in the object detection loss.
    use_dense_one_to_one (`bool`, *optional*, defaults to `True`):
        Whether to use dense one-to-one matching across decoder layers.
    mal_alpha (`float`, *optional*):
        Alpha parameter for the Matching Auxiliary Loss (MAL). If `None`, uses `focal_loss_alpha`.
    encoder_fuse_op (`str`, *optional*, defaults to `"sum"`):
        Fusion operation used in the encoder FPN. DEIMv2 uses `"sum"` instead of D-FINE's `"cat"`.
    spatial_tuning_adapter_inplanes (`int`, *optional*, defaults to 16):
        Number of input planes for the STA convolutional stem.
    encoder_type (`str`, *optional*, defaults to `"hybrid"`):
        Type of encoder to use. `"hybrid"` uses the full HybridEncoder with AIFI, FPN, and PAN.
        `"lite"` uses the lightweight LiteEncoder with GAP fusion for smaller variants (Atto, Femto, Pico).
    use_gateway (`bool`, *optional*, defaults to `True`):
        Whether to use the gateway mechanism (cross-attention gating) in decoder layers. When `False`,
        uses RMSNorm on the encoder attention output instead.
    share_bbox_head (`bool`, *optional*, defaults to `False`):
        Whether to share the bounding box prediction head across all decoder layers.
    encoder_has_trailing_conv (`bool`, *optional*, defaults to `True`):
        Whether the encoder's CSP blocks include a trailing 3x3 convolution after the bottleneck path.
        `True` for RepNCSPELAN4 (used by HGNetV2 N and LiteEncoder variants).
        `False` for RepNCSPELAN5 (used by DINOv3 variants).
    """

    # Class-level metadata consumed by the transformers config machinery.
    model_type = "deimv2"
    sub_configs = {"backbone_config": AutoConfig}
    layer_types = ["basic", "bottleneck"]
    # Maps common attribute names to this model's own field names so generic
    # code can read e.g. `config.hidden_size`.
    attribute_map = {
        "hidden_size": "d_model",
        "num_attention_heads": "encoder_attention_heads",
    }

    # Weight initialization, normalization epsilons, and backbone settings.
    initializer_range: float = 0.01
    initializer_bias_prior_prob: float | None = None
    layer_norm_eps: float = 1e-5
    batch_norm_eps: float = 1e-5
    backbone_config: dict | PreTrainedConfig | None = None
    freeze_backbone_batch_norms: bool = True

    # encoder HybridEncoder
    encoder_hidden_dim: int = 256
    encoder_in_channels: list[int] | tuple[int, ...] = (512, 1024, 2048)
    feat_strides: list[int] | tuple[int, ...] = (8, 16, 32)
    encoder_layers: int = 1
    encoder_ffn_dim: int = 1024
    encoder_attention_heads: int = 8
    dropout: float | int = 0.0
    activation_dropout: float | int = 0.0
    encode_proj_layers: list[int] | tuple[int, ...] = (2,)
    positional_encoding_temperature: int = 10000
    encoder_activation_function: str = "gelu"
    activation_function: str = "silu"

    eval_size: list[int] | tuple[int, int] | None = None
    normalize_before: bool = False
    hidden_expansion: float = 1.0

    # decoder Deimv2Transformer
    d_model: int = 256
    num_queries: int = 300
    decoder_in_channels: list[int] | tuple[int, ...] = (256, 256, 256)
    decoder_ffn_dim: int = 1024
    num_feature_levels: int = 3
    decoder_n_points: int | list[int] = 4
    decoder_layers: int = 6
    decoder_attention_heads: int = 8
    decoder_activation_function: str = "relu"
    attention_dropout: float | int = 0.0
    num_denoising: int = 100
    label_noise_ratio: float = 0.5
    box_noise_scale: float = 1.0
    learn_initial_query: bool = False
    anchor_image_size: int | list[int] | None = None
    with_box_refine: bool = True

    # Loss
    matcher_alpha: float = 0.25
    matcher_gamma: float = 2.0
    matcher_class_cost: float = 2.0
    matcher_bbox_cost: float = 5.0
    matcher_giou_cost: float = 2.0
    use_focal_loss: bool = True
    auxiliary_loss: bool = True
    focal_loss_alpha: float = 0.75
    focal_loss_gamma: float = 2.0
    weight_loss_vfl: float = 1.0
    weight_loss_bbox: float = 5.0
    weight_loss_giou: float = 2.0
    weight_loss_fgl: float = 0.15
    weight_loss_ddf: float = 1.5
    eos_coefficient: float = 1e-4
    eval_idx: int = -1
    layer_scale: int | float = 1.0
    max_num_bins: int = 32
    reg_scale: float = 4.0
    depth_mult: float = 1.0
    top_prob_values: int = 4
    lqe_hidden_dim: int = 64
    lqe_layers: int = 2
    decoder_offset_scale: float = 0.5
    decoder_method: str = "default"
    up: float = 0.5
    tie_word_embeddings: bool = True
    is_encoder_decoder: bool = True
    weight_loss_mal: float = 1.0
    use_dense_one_to_one: bool = True
    mal_alpha: float | None = None
    encoder_fuse_op: str = "sum"
    spatial_tuning_adapter_inplanes: int = 16
    encoder_type: str = "hybrid"
    use_gateway: bool = True
    share_bbox_head: bool = False
    encoder_has_trailing_conv: bool = True

    def __post_init__(self, **kwargs):
        """Resolve the backbone sub-config and derive dependent attributes.

        When no backbone configuration is given, `consolidate_backbone_kwargs_to_config`
        falls back to an `hgnet_v2` config with `out_indices=[2, 3, 4]`; any remaining
        kwargs are forwarded to the parent's `__post_init__`.
        """
        self.backbone_config, kwargs = consolidate_backbone_kwargs_to_config(
            backbone_config=self.backbone_config,
            default_config_type="hgnet_v2",
            default_config_kwargs={"out_indices": [2, 3, 4]},
            **kwargs,
        )
        # Derived per-head dimension of the decoder attention; exact divisibility
        # of `d_model` by `decoder_attention_heads` is enforced in `validate_architecture`.
        self.head_dim = self.d_model // self.decoder_attention_heads
        super().__post_init__(**kwargs)

    def validate_architecture(self):
        """Part of `@strict`-powered validation. Validates the architecture of the config."""
        # A per-level list of sampling points must provide exactly one entry per feature level.
        if isinstance(self.decoder_n_points, list):
            if len(self.decoder_n_points) != self.num_feature_levels:
                raise ValueError(
                    f"Length of decoder_n_points list ({len(self.decoder_n_points)}) must match num_feature_levels ({self.num_feature_levels})."
                )

        # `head_dim` was computed with floor division in `__post_init__`, so any
        # remainder makes this product differ from `d_model`.
        if self.head_dim * self.decoder_attention_heads != self.d_model:
            raise ValueError(
                f"Embedded dimension {self.d_model} must be divisible by decoder_attention_heads {self.decoder_attention_heads}"
            )


# Explicit public API of this generated module.
__all__ = ["Deimv2Config"]
