Commit 3d27645
baptiste-aubertin, staghado, molbap, and Cyrilvallez authored
Add LightOnOCR model implementation (#41621)
* Add LightOnOCR model implementation
* fix modular docstring error
* Improve LightOnOCR documentation and exports
* Rename LightOnOCR multi-modal projector to vision projection and add tests
* fix load without lmhead in safetensor
* temp
* Refactor LightOnOCR config to use sub_configs pattern
* rename processor kwargs
* Refactor LightOnOCR processor to use effective patch size

  Calculate effective_patch_size during initialization and use it throughout the processor. Update ProcessorKwargs defaults to include patch_size in images_kwargs. Remove redundant model_input_names property.
* Improve LightOnOCR generation support with proper KV cache handling
* add modeling tests and compile modular
* Clean up LightOnOCR code and remove unused variables

  Remove unused image_features variable and model_input_names property
* Add LightOnOCR documentation and test improvements

  Add model documentation page with config and class references. Update toctree to include LightOnOCR entry. Clean up test formatting and add vision/text models to private model exceptions.
* Refactor LightOnOCR to use standardized RopeParameters and consolidate shared components
* Rename LightOnOCR model classes and fix config parameter naming
  - Rename LightOnOCRText -> LightOnOCRTextModel and LightOnOCRVision -> LightOnOCRVisionModel
  - Fix parameter naming: image_token_index -> image_token_id
  - Set tie_word_embeddings default to False
  - Add special case for inherited Qwen3Config attributes in LightOnOCRTextConfig
* Add missing parameter documentation for LightOnOCR config
* Simplify LightOnOCR forward methods with decorators and fix loss function call
* Reorganize LightOnOCR components to place vision before text and remove debug print
* fixup
* Fix image token expansion logic in Processor
* Copy pixtral attention to have both pixtral and qwen eager attention forward
* remove LightOnOCRTextPreTrainedModel from modular to be able to return attention
* Support both tensor and list formats for image_sizes parameter
* Update tests/models/lightonocr/test_processor_lightonocr.py

  Co-authored-by: Pablo Montalvo <39954772+molbap@users.noreply.github.com>
* Update docs/source/en/model_doc/lightonocr.md

  Co-authored-by: Pablo Montalvo <39954772+molbap@users.noreply.github.com>
* Move image_sizes tensor conversion from model to processor
* Simplify weight initialization to use uniform text_config initializer_range
* rename 1 letter vars
* Get image special tokens from tokenizer attributes in processor
* Return BaseModelOutputWithPast from LightOnOCRModel forward
* Add chat template to LightOnOCR processor test setup
* rm get_output_embeddings from LightOnOCRForConditionalGeneration (not needed)
* Add OCR integration test for LightOnOCR model

  Tests model can perform OCR on real receipt image and extract expected text
* Fix device/dtype handling in LightOnOCR vision processing
* Add TransformersKwargs type hints to LightOnOCR forward methods
* Make torch imports conditional and use _from_config for LightOnOCR sub-models
* Set patch_size at runtime instead of modifying class defaults in LightOnOCR processor
* type kwargs
* Remove loocr forward comments
* Add vocab_size property and fix image_token_id in LightOnOCR
  - Add vocab_size property to LightOnOCRConfig that delegates to text_config
  - Fix test parameter name from image_token_index to image_token_id
  - Add Unpack type hint to processor __call__ kwargs
  - Remove unnecessary comments from modeling forward method
* Add vocab_size setter to LightOnOCR configuration
* Fix device mismatch in vision rotary embeddings and optimize test image sizes
* Improve LightOnOCR integration test with similarity-based output validation
* Enable flex attention
* Enable flex attention
* Loocr description with blogpost
* redundant tie_word_embeddings
* remove architecture from default config
* vocab_size accessors
* remove useless tensor conversion
* remove useless conversion
* move dtype conversion to after image feature extraction
* remove useless stuff
* fixup
* export text and vision config classes
* refactor(lightonocr): remove unused weight initialization and fix tied weights mapping #0
  - Remove custom _init_weights methods (handled by base class)
  - Update _tied_weights_keys to dict format with explicit mapping
  - Update documentation date
* fix(lightonocr): fix test failures for vocab_size access and device placement #0
  - Use config.text_config.vocab_size instead of config.vocab_size for composite config
  - Remove explicit device placement from attention_mask and image_sizes tensors
  - Allow device_map='auto' to handle device placement in model parallelism tests
* ruff
* rebase 8/12/2025
* rebase 09/12/2025
* review zucchini
* review zucchini
* rebase 10/12/2025
* refactor(lighton_ocr): fix naming conventions to use snake_case and proper CamelCase #0
  - Rename model identifier from 'lightonocr' to 'lighton_ocr' (snake_case)
  - Update class names from 'LightOnOCR*' to 'LightOnOcr*' (proper CamelCase)
  - Update all auto mappings, tests, and documentation accordingly
* style(lighton_ocr): remove unnecessary import guards for torch and vision #0
* style(lighton_ocr): remove unnecessary pass statement from LightOnOcrVisionConfig #0
* refactor(lighton_ocr): consolidate RMSNorm classes and use PixtralRMSNorm base #0
* refactor(lighton_ocr): import rotary pos emb functions from pixtral instead of redefining #0
  - Remove duplicate vision_rotate_half and vision_apply_rotary_pos_emb functions
  - Import apply_rotary_pos_emb from pixtral modeling
  - Consolidate rotate_half/apply_rotary_pos_emb in generated modeling file
* refactor(lighton_ocr): remove unused LightOnOcrVisionPreTrainedModel class #0
  - Remove redundant VisionPreTrainedModel class that was not used
  - Add LightOnOcrVisionAttentionLayer to _no_split_modules in main PreTrainedModel
* refactor(lighton_ocr): simplify LightOnOcrAttention and clarify docstring #0
  - Remove redundant __init__ that only called super()
  - Update docstring to explain why class exists (avoids eager_attention_forward collision with Qwen3)
* test(lighton_ocr): remove unnecessary skipped test methods #0
* refactor(lighton_ocr): remove use_sliding_window and max_window_layers from config #0
  - Use del in __init__ to explicitly remove inherited attrs from Qwen3Config
  - Remove LightOnOCRTextConfig from check_config_attributes.py exception list
  - Fix rms_norm_eps type annotation from int to float
* fix make fixup
* docs(lighton_ocr): add docstring to LightOnOcrTextConfig and clean up check_repo #0
  - Add configuration docstring with all parameters to LightOnOcrTextConfig
  - Consolidate duplicate comments in PRIVATE_MODELS
  - Remove redundant entries from IGNORE_NON_TESTED and IGNORE_NON_AUTO_CONFIGURED
* chore(lighton_ocr): update copyright headers to LightOn Team #0
* refactor(lighton_ocr): clean up model files and add license headers #0
  - Add Apache 2.0 license headers to generated files
  - Remove unused embedding getter/setter methods from ForConditionalGeneration
  - Clean up LightOnOcrTextConfig docstring and remove Qwen references
* refactor(lighton_ocr): simplify processor token access and test setup #0
  - Access special tokens directly from tokenizer attributes instead of getattr with defaults
  - Simplify test setup to use model_id and inherited ProcessorTesterMixin methods
  - Fix return types test to handle fast image processor limitations
* refactor(lighton_ocr): unify attention functions and fix buffer registration #0
  - Remove duplicate vision_eager_attention_forward, reuse eager_attention_forward from Qwen3
  - Add num_key_value_groups attribute for GQA compatibility
  - Register original_inv_freq as buffer instead of plain attribute
* refactor(lighton_ocr): remove vision_model property alias #0
* docs(lighton_ocr): add usage example and update release date #0
* rebase 12/01/26
* Update docs/source/en/model_doc/lighton_ocr.md

  Co-authored-by: Cyril Vallez <cyril.vallez@gmail.com>
* review cyril
* review cyril
* review cyril
* Remove test.py from version control
* apply modular
* update years everywhere it was not updated
* fix date
* remove Attention forward implem
* Fix all Vision prefixes instead of no prefix
* move tying to main config
* fix
* add to all
* immensely simplify
* fix test
* revert check_repo

---------

Co-authored-by: Said Taghadouini <taghadouinisaid@gmail.com>
Co-authored-by: Pablo Montalvo <39954772+molbap@users.noreply.github.com>
Co-authored-by: Cyril Vallez <cyril.vallez@huggingface.co>
Co-authored-by: Cyril Vallez <cyril.vallez@gmail.com>
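One of the bullets above refactors the processor around an "effective patch size". A rough, dependency-free sketch of that idea (these function names and the ceil-division token count are assumptions, not the transformers implementation; patch_size=14 and spatial_merge_size=2 are the defaults from the config added in this commit):

```python
import math


def effective_patch_size(patch_size: int, spatial_merge_size: int) -> int:
    # Fold the spatial merge factor into the patch size once, up front,
    # instead of multiplying at every use site.
    return patch_size * spatial_merge_size


def tokens_for_image(height: int, width: int, patch_size: int = 14, spatial_merge_size: int = 2) -> int:
    eff = effective_patch_size(patch_size, spatial_merge_size)
    # One token per merged patch, rounding partial patches up.
    return math.ceil(height / eff) * math.ceil(width / eff)


print(effective_patch_size(14, 2))   # 28
print(tokens_for_image(1540, 1540))  # 55 * 55 = 3025
```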
1 parent 77146cc commit 3d27645

17 files changed

Lines changed: 2138 additions & 4 deletions

docs/source/en/_toctree.yml

Lines changed: 2 additions & 0 deletions
```diff
@@ -1115,6 +1115,8 @@
     title: LayoutXLM
   - local: model_doc/lfm2_vl
     title: LFM2-VL
+  - local: model_doc/lighton_ocr
+    title: LightOnOcr
   - local: model_doc/lilt
     title: LiLT
   - local: model_doc/llama4
```
docs/source/en/model_doc/lighton_ocr.md

Lines changed: 80 additions & 0 deletions
````markdown
<!--Copyright 2026 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the
License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer. -->

*This model was released on {release_date} and added to Hugging Face Transformers on 2026-01-14.*

# LightOnOcr

**LightOnOcr** is a compact, end-to-end vision–language model for Optical Character Recognition (OCR) and document understanding. It achieves state-of-the-art accuracy in its weight class while being several times faster and cheaper than larger general-purpose VLMs.

📝 **[Read the full blog post](https://huggingface.co/blog/lightonai/lightonocr/)** | 📓 **[Finetuning notebook](https://colab.research.google.com/drive/1WjbsFJZ4vOAAlKtcCauFLn_evo5UBRNa?usp=sharing)**

**Model Overview**

LightOnOcr combines a Vision Transformer encoder (Pixtral-based) with a lightweight text decoder (Qwen3-based) distilled from high-quality open VLMs. It is optimized for document parsing tasks, producing accurate, layout-aware text extraction from high-resolution pages.

## Usage

```python
import torch

from transformers import LightOnOcrForConditionalGeneration, LightOnOcrProcessor

# Pick the best available device and a dtype it supports.
device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float32 if device == "mps" else torch.bfloat16

model = LightOnOcrForConditionalGeneration.from_pretrained("lightonai/LightOnOCR-1B-1025", dtype=dtype).to(device)
processor = LightOnOcrProcessor.from_pretrained("lightonai/LightOnOCR-1B-1025")

url = "https://huggingface.co/datasets/hf-internal-testing/fixtures_ocr/resolve/main/SROIE-receipt.jpeg"

conversation = [{"role": "user", "content": [{"type": "image", "url": url}]}]

inputs = processor.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
# Cast floating-point tensors (pixel values) to the model dtype; keep integer tensors (token ids) as-is.
inputs = {k: v.to(device=device, dtype=dtype) if v.is_floating_point() else v.to(device) for k, v in inputs.items()}

output_ids = model.generate(**inputs, max_new_tokens=1024)
# Strip the prompt tokens so only the newly generated text is decoded.
generated_ids = output_ids[0, inputs["input_ids"].shape[1] :]
output_text = processor.decode(generated_ids, skip_special_tokens=True)
print(output_text)
```

## LightOnOcrConfig

[[autodoc]] LightOnOcrConfig

## LightOnOcrProcessor

[[autodoc]] LightOnOcrProcessor
    - __call__

## LightOnOcrModel

[[autodoc]] LightOnOcrModel
    - forward

## LightOnOcrForConditionalGeneration

[[autodoc]] LightOnOcrForConditionalGeneration
    - forward
````
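The usage snippet in the new doc page decodes only `output_ids[0, inputs["input_ids"].shape[1]:]`. A toy, dependency-free illustration of why that slice strips the prompt (the `mock_generate` below and its token values are hypothetical stand-ins, not the model's API):

```python
def mock_generate(input_ids: list[int], max_new_tokens: int) -> list[int]:
    # Like model.generate(), the returned sequence begins with the prompt
    # tokens, followed by the newly sampled tokens.
    new_tokens = [100 + i for i in range(max_new_tokens)]  # stand-in for sampled ids
    return input_ids + new_tokens


prompt = [1, 2, 3, 4]
output_ids = mock_generate(prompt, max_new_tokens=3)
# Mirrors output_ids[0, inputs["input_ids"].shape[1]:] in the doc example.
generated_ids = output_ids[len(prompt):]
print(generated_ids)  # [100, 101, 102]
```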

src/transformers/models/auto/configuration_auto.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -241,6 +241,7 @@
         ("lfm2_moe", "Lfm2MoeConfig"),
         ("lfm2_vl", "Lfm2VlConfig"),
         ("lightglue", "LightGlueConfig"),
+        ("lighton_ocr", "LightOnOcrConfig"),
         ("lilt", "LiltConfig"),
         ("llama", "LlamaConfig"),
         ("llama4", "Llama4Config"),
@@ -705,6 +706,7 @@
         ("lfm2_moe", "Lfm2Moe"),
         ("lfm2_vl", "Lfm2Vl"),
         ("lightglue", "LightGlue"),
+        ("lighton_ocr", "LightOnOcr"),
         ("lilt", "LiLT"),
         ("llama", "LLaMA"),
         ("llama2", "Llama2"),
```

src/transformers/models/auto/image_processing_auto.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -130,6 +130,7 @@
         ("levit", ("LevitImageProcessor", "LevitImageProcessorFast")),
         ("lfm2_vl", (None, "Lfm2VlImageProcessorFast")),
         ("lightglue", ("LightGlueImageProcessor", "LightGlueImageProcessorFast")),
+        ("lighton_ocr", ("PixtralImageProcessor", "PixtralImageProcessorFast")),
         ("llama4", (None, "Llama4ImageProcessorFast")),
         ("llava", ("LlavaImageProcessor", "LlavaImageProcessorFast")),
         ("llava_next", ("LlavaNextImageProcessor", "LlavaNextImageProcessorFast")),
```

src/transformers/models/auto/modeling_auto.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -240,6 +240,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("lfm2_moe", "Lfm2MoeModel"),
         ("lfm2_vl", "Lfm2VlModel"),
         ("lightglue", "LightGlueForKeypointMatching"),
+        ("lighton_ocr", "LightOnOcrModel"),
         ("lilt", "LiltModel"),
         ("llama", "LlamaModel"),
         ("llama4", "Llama4ForConditionalGeneration"),
@@ -924,6 +925,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("kosmos-2", "Kosmos2ForConditionalGeneration"),
         ("kosmos-2.5", "Kosmos2_5ForConditionalGeneration"),
         ("lfm2_vl", "Lfm2VlForConditionalGeneration"),
+        ("lighton_ocr", "LightOnOcrForConditionalGeneration"),
         ("llama4", "Llama4ForConditionalGeneration"),
         ("llava", "LlavaForConditionalGeneration"),
         ("llava_next", "LlavaNextForConditionalGeneration"),
```

src/transformers/models/auto/processing_auto.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -101,6 +101,7 @@
         ("layoutlmv3", "LayoutLMv3Processor"),
         ("layoutxlm", "LayoutXLMProcessor"),
         ("lfm2_vl", "Lfm2VlProcessor"),
+        ("lighton_ocr", "LightOnOcrProcessor"),
         ("llama4", "Llama4Processor"),
         ("llava", "LlavaProcessor"),
         ("llava_next", "LlavaNextProcessor"),
```

src/transformers/models/auto/tokenization_auto.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -169,6 +169,7 @@
         ("layoutlmv3", "LayoutLMv3Tokenizer" if is_tokenizers_available() else None),
         ("layoutxlm", "LayoutXLMTokenizer" if is_tokenizers_available() else None),
         ("led", "LEDTokenizer" if is_tokenizers_available() else None),
+        ("lighton_ocr", "Qwen2TokenizerFast" if is_tokenizers_available() else None),
         ("lilt", "RobertaTokenizer" if is_tokenizers_available() else None),
         ("longformer", "RobertaTokenizer" if is_tokenizers_available() else None),
         ("longt5", "T5Tokenizer" if is_tokenizers_available() else None),
```
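Each hunk above registers `lighton_ocr` in a `model_type` → class-name mapping that the Auto classes consult. A self-contained mock of that lookup pattern (`MockAutoConfig` and the three-entry mapping are illustrative; the real Auto classes also import and instantiate the class lazily):

```python
# Stand-in for the registries the hunks above extend.
CONFIG_MAPPING_NAMES = {
    "lightglue": "LightGlueConfig",
    "lighton_ocr": "LightOnOcrConfig",
    "lilt": "LiltConfig",
}


class MockAutoConfig:
    @staticmethod
    def for_model(model_type: str) -> str:
        try:
            class_name = CONFIG_MAPPING_NAMES[model_type]
        except KeyError:
            raise ValueError(f"Unrecognized model type: {model_type!r}")
        # A real implementation would import and instantiate the class here;
        # returning its name is enough to show the dispatch.
        return class_name


print(MockAutoConfig.for_model("lighton_ocr"))  # LightOnOcrConfig
```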
src/transformers/models/lighton_ocr/__init__.py

Lines changed: 28 additions & 0 deletions
```python
# Copyright 2026 The LightOn Team and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ...utils import _LazyModule
from ...utils.import_utils import define_import_structure


if TYPE_CHECKING:
    from .configuration_lighton_ocr import *
    from .modeling_lighton_ocr import *
    from .processing_lighton_ocr import *
else:
    import sys

    _file = globals()["__file__"]
    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
```
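The `else` branch above swaps the package module for a `_LazyModule` so that submodules are imported only on first attribute access. A minimal, self-contained sketch of the underlying PEP 562 idea (this is not `_LazyModule` itself; the one-entry registry and module name are illustrative):

```python
import sys
import types

# Map attribute names to thunks that produce them on demand; a real lazy
# module would import the defining submodule here instead.
_registry = {"LightOnOcrConfig": lambda: "loaded LightOnOcrConfig"}

lazy = types.ModuleType("lighton_ocr_demo")


def _module_getattr(name):
    # PEP 562: Python calls the module's __getattr__ when normal lookup fails,
    # so the object is built (or imported) only on first access.
    try:
        return _registry[name]()
    except KeyError:
        raise AttributeError(f"module {lazy.__name__!r} has no attribute {name!r}")


lazy.__getattr__ = _module_getattr
sys.modules["lighton_ocr_demo"] = lazy

import lighton_ocr_demo

print(lighton_ocr_demo.LightOnOcrConfig)  # resolved lazily on first access
```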
src/transformers/models/lighton_ocr/configuration_lighton_ocr.py

Lines changed: 128 additions & 0 deletions
````python
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from src/transformers/models/lighton_ocr/modular_lighton_ocr.py.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_lighton_ocr.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# Copyright 2026 The LightOn Team and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any

from ...configuration_utils import PretrainedConfig
from ..auto import CONFIG_MAPPING, AutoConfig


class LightOnOcrConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`LightOnOcrForConditionalGeneration`]. It is used
    to instantiate a LightOnOcr model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information. Instantiating a configuration with the defaults will
    yield a similar configuration to that of the LightOnOcr
    [lightonocr-hf/lightonocr-9b](https://huggingface.co/lightonocr-hf/lightonocr-9b) architecture.

    Args:
        spatial_merge_size (`int`, *optional*, defaults to 2):
            The size of spatial merging for image patches.
        image_token_id (`int`, *optional*, defaults to 151655):
            The id of the image token in the vocabulary.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether the model's input and output word embeddings should be tied.
        vision_config (`dict` or `LightOnOcrVisionConfig`, *optional*):
            Custom vision configuration or dictionary with vision configuration values.
        text_config (`dict` or `LightOnOcrTextConfig`, *optional*):
            Custom text configuration or dictionary with text configuration values.

    Example:

    ```python
    >>> from transformers import LightOnOcrConfig, LightOnOcrForConditionalGeneration

    >>> # Initializing a LightOnOcr configuration
    >>> configuration = LightOnOcrConfig()

    >>> # Initializing a model from the configuration
    >>> model = LightOnOcrForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "lighton_ocr"
    sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}

    def __init__(
        self,
        spatial_merge_size: int = 2,
        image_token_id: int = 151655,
        tie_word_embeddings: bool = True,
        vision_config: dict[str, Any] | None = None,
        text_config: dict[str, Any] | None = None,
        **kwargs,
    ):
        self.spatial_merge_size = spatial_merge_size
        self.image_token_id = image_token_id
        self.tie_word_embeddings = tie_word_embeddings

        if vision_config is None:
            self.vision_config = CONFIG_MAPPING["pixtral"](
                attention_dropout=0,
                head_dim=64,
                hidden_act="silu",
                hidden_size=1024,
                image_size=1540,
                initializer_range=0.02,
                intermediate_size=4096,
                model_type="pixtral",
                num_attention_heads=16,
                num_channels=3,
                num_hidden_layers=24,
                patch_size=14,
                rope_theta=10000,
            )
        elif isinstance(vision_config, PretrainedConfig):
            self.vision_config = vision_config
        else:
            vision_config["model_type"] = vision_config.get("model_type", "pixtral")
            self.vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)

        if text_config is None:
            self.text_config = CONFIG_MAPPING["qwen3"](
                attention_dropout=0,
                head_dim=128,
                hidden_act="silu",
                hidden_size=1024,
                initializer_range=0.02,
                intermediate_size=3072,
                max_position_embeddings=40960,
                num_attention_heads=16,
                num_hidden_layers=28,
                num_key_value_heads=8,
                rms_norm_eps=1e-6,
                rope_theta=1000000,
                sliding_window=None,
                use_cache=True,
                vocab_size=151936,
            )
        elif isinstance(text_config, PretrainedConfig):
            self.text_config = text_config
        else:
            text_config["model_type"] = text_config.get("model_type", "qwen3")
            self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)

        super().__init__(**kwargs)


__all__ = ["LightOnOcrConfig"]
````
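The `__init__` above resolves each sub-config in three ways: `None` → library defaults, an existing config object → used as-is, a `dict` → built via the `model_type` entry in `CONFIG_MAPPING`. A dependency-free mock of that branching (`MockConfig`, the two-entry mapping, and `resolve_vision_config` are illustrative stand-ins, not transformers classes):

```python
class MockConfig:
    # Stand-in for PretrainedConfig: just stores whatever it is given.
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)


# Stand-in for transformers' CONFIG_MAPPING: model_type -> config factory.
CONFIG_MAPPING = {
    "pixtral": lambda **kw: MockConfig(**{"model_type": "pixtral", **kw}),
    "qwen3": lambda **kw: MockConfig(**{"model_type": "qwen3", **kw}),
}


def resolve_vision_config(vision_config):
    if vision_config is None:
        # Branch 1: nothing given -> instantiate library defaults.
        return CONFIG_MAPPING["pixtral"](patch_size=14, hidden_size=1024)
    if isinstance(vision_config, MockConfig):
        # Branch 2: already a config object -> use it unchanged.
        return vision_config
    # Branch 3: plain dict -> look up model_type (default "pixtral") and build.
    vision_config["model_type"] = vision_config.get("model_type", "pixtral")
    return CONFIG_MAPPING[vision_config["model_type"]](**vision_config)


print(resolve_vision_config(None).patch_size)                # 14
print(resolve_vision_config({"patch_size": 16}).model_type)  # pixtral
```

The same three-way pattern is applied to `text_config` with `"qwen3"` as the default `model_type`.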
