#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
#           This file was automatically generated from src/transformers/models/slanext/modular_slanext.py.
#               Do NOT edit this file manually as any edits will be overwritten by the generation of
#             the file from the modular. If any change should be done, please apply the change to the
#                          modular_slanext.py file directly. One of our CI checks enforces this.
#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Copyright 2026 The PaddlePaddle Team and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import torch
import torchvision.transforms.v2.functional as tvF

from ...image_processing_backends import TorchvisionBackend
from ...image_processing_utils import BatchFeature
from ...image_transforms import group_images_by_shape, reorder_images
from ...image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, SizeDict
from ...processing_utils import ImagesKwargs, Unpack
from ...utils import auto_docstring, is_torchdynamo_compiling, logging
from ...utils.generic import TensorType
from ...utils.import_utils import requires


# Module-level logger following the library-wide logging convention.
logger = logging.get_logger(__name__)


@auto_docstring
@requires(backends=("torch",))
class SLANeXtImageProcessor(TorchvisionBackend):
    # Default preprocessing configuration. `resample` is kept for API compatibility
    # with other image processors but is effectively ignored: `_resize` always uses
    # its own fixed-point bilinear kernel (a warning is emitted in `_preprocess`).
    resample = 2  # PILImageResampling.BILINEAR
    image_mean = IMAGENET_DEFAULT_MEAN
    image_std = IMAGENET_DEFAULT_STD
    size = {"height": 512, "width": 512}
    pad_size = {"height": 512, "width": 512}
    do_convert_rgb = True
    do_resize = True
    do_rescale = True
    do_normalize = True
    do_pad = True

    def _resize(
        self,
        image: "torch.Tensor",
        size: SizeDict,
    ) -> "torch.Tensor":
        """
        Resize a batch of images with fixed-point bilinear interpolation.

        The kernel uses 11-bit (2048-step) fixed-point weights, which reproduces
        OpenCV-style `INTER_LINEAR` results bit-exactly rather than relying on
        torchvision's float interpolation. The image's longer side is scaled to
        `max(size.height, size.width)`, preserving aspect ratio.

        Args:
            image (`torch.Tensor`):
                Input of shape `(batch_size, channels, height, width)`. Pixel
                values are clamped to `[0, 255]` before interpolation, so inputs
                are expected in that range (unrescaled).
            size (`SizeDict`):
                Target size; only `max(size.height, size.width)` is used.

        Returns:
            `torch.Tensor`: Resized tensor of shape
            `(batch_size, channels, target_height, target_width)`, cast back to
            the input dtype.
        """
        # NOTE(review): the boundary clamps below use `width - 2` / `height - 2`,
        # so this assumes height >= 2 and width >= 2 — confirm upstream inputs.
        batch_size, channels, height, width = image.shape
        # Fold batch and channel axes so every plane is interpolated in one pass.
        image = image.view(batch_size * channels, height, width)

        device = image.device

        # Aspect-preserving scale: longer side maps to the longer target side.
        scale = max(size.height, size.width) / max(height, width)
        target_height = round(height * scale)
        target_width = round(width * scale)

        # --- column coordinate tables (pixel-center alignment) ---
        target_col = torch.arange(target_width, dtype=torch.float32, device=device)
        src_col = (target_col + 0.5) * (float(width) / float(target_width)) - 0.5
        src_col_floor = src_col.floor().to(torch.int32)
        src_col_frac = src_col - src_col_floor.float()
        # boundary handling: clamp to [0, width - 2] and pin the fraction so the
        # interpolation replicates edge pixels instead of reading out of bounds
        src_col_frac = torch.where(src_col_floor < 0, torch.zeros_like(src_col_frac), src_col_frac)
        src_col_floor = torch.where(src_col_floor < 0, torch.zeros_like(src_col_floor), src_col_floor)
        src_col_frac = torch.where(src_col_floor >= width - 1, torch.ones_like(src_col_frac), src_col_frac)
        src_col_floor = torch.where(
            src_col_floor >= width - 1, torch.full_like(src_col_floor, width - 2), src_col_floor
        )
        # fixed-point weights (11 fractional bits)
        weight_right = (src_col_frac * 2048 + 0.5).floor().to(torch.int32)  # round-to-nearest
        weight_left = 2048 - weight_right  # (target_w,)
        # --- row coordinate tables ---
        target_row = torch.arange(target_height, dtype=torch.float32, device=device)
        src_row = (target_row + 0.5) * (float(height) / float(target_height)) - 0.5
        src_row_floor = src_row.floor().to(torch.int32)
        src_row_frac = src_row - src_row_floor.float()
        src_row_frac = torch.where(src_row_floor < 0, torch.zeros_like(src_row_frac), src_row_frac)
        src_row_floor = torch.where(src_row_floor < 0, torch.zeros_like(src_row_floor), src_row_floor)
        src_row_frac = torch.where(src_row_floor >= height - 1, torch.ones_like(src_row_frac), src_row_frac)
        src_row_floor = torch.where(
            src_row_floor >= height - 1, torch.full_like(src_row_floor, height - 2), src_row_floor
        )
        weight_bottom = (src_row_frac * 2048 + 0.5).floor().to(torch.int32)
        weight_top = 2048 - weight_bottom  # (target_h,)

        image_uint8 = image.clamp(0, 255).to(torch.uint8)  # (B*C, H, W)
        image_int32 = image_uint8.to(torch.int32)  # (B*C, H, W)
        col_left = src_col_floor.long()  # (target_w,)
        col_right = (src_col_floor + 1).long()  # (target_w,)  safe: src_col_floor <= width-2
        row_top = src_row_floor.long()  # (target_h,)
        row_bottom = (src_row_floor + 1).long()  # (target_h,)
        # gather 4 neighbours: (B*C, target_h, target_w)
        pixel_top_left = image_int32[:, row_top[:, None], col_left[None, :]]
        pixel_top_right = image_int32[:, row_top[:, None], col_right[None, :]]
        pixel_bottom_left = image_int32[:, row_bottom[:, None], col_left[None, :]]
        pixel_bottom_right = image_int32[:, row_bottom[:, None], col_right[None, :]]
        # fixed-point bilinear: weights broadcast over (B*C, target_h, target_w)
        weight_bottom_3d = weight_bottom.view(1, target_height, 1)
        weight_top_3d = weight_top.view(1, target_height, 1)
        weight_right_3d = weight_right.view(1, 1, target_width)
        weight_left_3d = weight_left.view(1, 1, target_width)
        interp = weight_top_3d * (
            weight_left_3d * pixel_top_left + weight_right_3d * pixel_top_right
        ) + weight_bottom_3d * (weight_left_3d * pixel_bottom_left + weight_right_3d * pixel_bottom_right)
        # Two 11-bit weight products -> 22 fractional bits; add half-ulp then shift
        # right by 22 for round-to-nearest.
        interp = (interp + (1 << 21)) >> 22
        result = interp.clamp(0, 255).to(torch.uint8)  # (B*C, target_h, target_w)

        return result.view(batch_size, channels, target_height, target_width).to(dtype=image.dtype)

    def _preprocess(
        self,
        images: list["torch.Tensor"],
        do_resize: bool,
        size: SizeDict,
        resample: "tvF.InterpolationMode | int | None",
        do_center_crop: bool,
        crop_size: SizeDict,
        do_rescale: bool,
        rescale_factor: float,
        do_normalize: bool,
        image_mean: float | list[float] | None,
        image_std: float | list[float] | None,
        do_pad: bool | None,
        pad_size: SizeDict | None,
        disable_grouping: bool | None,
        return_tensors: str | TensorType | None,
        **kwargs,
    ) -> BatchFeature:
        """
        Apply the full preprocessing pipeline: resize (fixed-point bilinear),
        optional center-crop, fused rescale+normalize, and optional padding.

        Images are grouped by shape so that same-sized images are processed as a
        single batched tensor, then restored to their original order.

        Returns:
            [`BatchFeature`] with a single `pixel_values` entry.
        """
        # `resample` is accepted for interface parity but never used: `_resize`
        # implements its own interpolation. Warn once (skipped under dynamo).
        if resample is not None and not is_torchdynamo_compiling():
            logger.warning_once("Resampling is not supported in SLANeXt")

        # Group images by size for batched resizing
        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
        resized_images_grouped = {}
        for shape, stacked_images in grouped_images.items():
            if do_resize:
                stacked_images = self._resize(image=stacked_images, size=size)
            resized_images_grouped[shape] = stacked_images
        resized_images = reorder_images(resized_images_grouped, grouped_images_index)

        # Group images by size for further processing
        # Needed in case do_resize is False, or resize returns images with different sizes
        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
        processed_images_grouped = {}
        for shape, stacked_images in grouped_images.items():
            if do_center_crop:
                stacked_images = self.center_crop(stacked_images, crop_size)
            # Fused rescale and normalize
            stacked_images = self.rescale_and_normalize(
                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
            )
            processed_images_grouped[shape] = stacked_images
        processed_images = reorder_images(processed_images_grouped, grouped_images_index)

        if do_pad:
            processed_images = self.pad(processed_images, pad_size=pad_size, disable_grouping=disable_grouping)

        return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)

    def __init__(self, **kwargs: Unpack[ImagesKwargs]):
        """Initialize the processor and build the table-structure decoder vocabulary."""
        super().__init__(**kwargs)
        self.init_decoder()

    def init_decoder(self):
        """
        Initialize the decoder vocabulary for table structure recognition.

        Builds a character dictionary mapping HTML table structure tokens (e.g., `<thead>`, `<tr>`, `<td>`, colspan/
        rowspan attributes) to integer indices. The dictionary includes special `"sos"` (start-of-sequence) and
        `"eos"` (end-of-sequence) tokens. Merged `<td></td>` tokens are used in place of standalone `<td>` tokens
        when applicable.
        """
        dict_character = [
            "<thead>",
            "</thead>",
            "<tbody>",
            "</tbody>",
            "<tr>",
            "</tr>",
            "<td>",
            "<td",
            ">",
            "</td>",
        ]
        # colspan/rowspan attribute tokens for spans of 2..20 cells
        dict_character += [f' colspan="{i + 2}"' for i in range(19)]
        dict_character += [f' rowspan="{i + 2}"' for i in range(19)]

        # Replace the standalone "<td>" token with the merged "<td></td>" token.
        if "<td></td>" not in dict_character:
            dict_character.append("<td></td>")
        if "<td>" in dict_character:
            dict_character.remove("<td>")

        dict_character = ["sos"] + dict_character + ["eos"]
        # token -> index lookup used for special-token ids
        self.dict = {char: i for i, char in enumerate(dict_character)}
        # index -> token lookup used when decoding predictions
        self.character = dict_character
        self.td_token = ["<td>", "<td", "<td></td>"]
        self.bos_id = self.dict["sos"]
        self.eos_id = self.dict["eos"]

    def post_process_table_recognition(self, outputs):
        """
        Post-process the raw model outputs to decode the predicted table structure into an HTML token sequence.

        Converts the model's predicted probability distributions over the structure vocabulary into a sequence of
        HTML tokens representing the table structure. The decoded tokens are wrapped with `<html>`, `<body>`, and
        `<table>` tags to form a complete HTML table structure.

        Args:
            outputs ([`SLANeXtForTableRecognitionOutput`]):
                Raw outputs from the SLANeXt model. The `last_hidden_state` field contains the predicted probability
                distributions over the structure vocabulary at each decoding step, with shape
                `(batch_size, max_text_length, num_classes)`.

        Returns:
            `dict`: A dictionary containing:
                - **structure** (`list[str]`): The predicted HTML table structure as a list of tokens, wrapped with
                  `<html>`, `<body>`, and `<table>` tags.
                - **structure_score** (`float`): The mean confidence score across all predicted tokens (0.0 when no
                  token was decoded).
        """
        self.pred = outputs.last_hidden_state
        # Only the first batch element is decoded.
        structure_probs = self.pred[0:1]
        ignored_tokens = [int(self.bos_id), int(self.eos_id)]
        end_idx = int(self.eos_id)

        # Greedy decoding: most likely token per step, with its probability.
        structure_idx = structure_probs.argmax(dim=2)
        structure_probs = structure_probs.max(dim=2).values

        structure_str_list = []
        structure_score = 0.0
        batch_size = structure_idx.shape[0]
        for batch_index in range(batch_size):
            structure_list = []
            score_list = []
            for position in range(structure_idx.shape[1]):
                char_idx = int(structure_idx[batch_index, position])
                # Stop at the first EOS after position 0.
                if position > 0 and char_idx == end_idx:
                    break
                if char_idx in ignored_tokens:
                    continue
                text = self.character[char_idx]
                structure_list.append(text)
                score_list.append(structure_probs[batch_index, position])
            structure_str_list.append(structure_list)
            # Guard against an empty decode (e.g. immediate EOS): `torch.stack`
            # raises on an empty list, so keep the 0.0 default in that case.
            if score_list:
                structure_score = torch.stack(score_list).mean().item()

        structure = ["<html>", "<body>", "<table>"] + structure_str_list[0] + ["</table>", "</body>", "</html>"]
        return {"structure": structure, "structure_score": structure_score}


# Explicit public API of this module.
__all__ = ["SLANeXtImageProcessor"]
