From df347ab0a8624c9ca8055efd734ce82e7f01474c Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Tue, 5 Aug 2025 11:05:16 +0200 Subject: [PATCH 01/25] chg: improved documentation. --- README.rst | 184 +++++++++++--------- docs/index.rst | 2 +- docs/inscriptis-module-documentation.rst | 58 ------ pyproject.toml | 1 + src/inscriptis/__init__.py | 6 +- src/inscriptis/annotation/output/surface.py | 1 + src/inscriptis/annotation/parser.py | 1 + src/inscriptis/css_profiles.py | 11 +- src/inscriptis/model/canvas/block.py | 1 + src/inscriptis/model/config.py | 46 ++++- src/inscriptis/model/css.py | 1 + src/inscriptis/model/html_element.py | 44 +++-- src/inscriptis/model/tag/__init__.py | 1 + src/inscriptis/model/tag/a_tag.py | 1 + src/inscriptis/model/tag/br_tag.py | 1 + src/inscriptis/model/tag/img_tag.py | 1 + src/inscriptis/model/tag/list_tag.py | 1 + src/inscriptis/model/tag/table_tag.py | 1 + 18 files changed, 191 insertions(+), 171 deletions(-) delete mode 100644 docs/inscriptis-module-documentation.rst diff --git a/README.rst b/README.rst index 2add60b..d0371e8 100644 --- a/README.rst +++ b/README.rst @@ -6,10 +6,6 @@ inscriptis -- HTML to text conversion library, command line client and Web servi :target: https://badge.fury.io/py/inscriptis :alt: Supported python versions -.. image:: https://api.codeclimate.com/v1/badges/f8ed73f8a764f2bc4eba/maintainability - :target: https://codeclimate.com/github/weblyzard/inscriptis/maintainability - :alt: Maintainability - .. 
image:: https://codecov.io/gh/weblyzard/inscriptis/branch/master/graph/badge.svg :target: https://codecov.io/gh/weblyzard/inscriptis/ :alt: Coverage @@ -492,41 +488,112 @@ be used within a program: print("Text:", output['text']) print("Annotations:", output['label']) -Fine tuning ------------ +Fine-tuning the HTML rendering +------------------------------ -The following options are available for fine tuning inscriptis' HTML rendering: +Inscriptis provides the ``ParserConfig`` class to fine-tune the HTML rendering +(`see documentation `_). -1. **More rigorous indentation:** call ``inscriptis.get_text()`` with the - parameter ``indentation='extended'`` to also use indentation for tags such as - ``
`` and ```` that do not provide indentation in their standard - definition. This strategy is the default in ``inscript`` and many other - tools such as Lynx. If you do not want extended indentation you can use the - parameter ``indentation='standard'`` instead. +It allows modifying the interpretation of HTML-tags and setting parameters that control the rendering of anchors, +captions, images and links. -2. **Overwriting the default CSS definition:** inscriptis uses CSS definitions - that are maintained in ``inscriptis.css.CSS`` for rendering HTML tags. You can - override these definitions (and therefore change the rendering) as outlined - below: +1. **Firefox-like whitespace handling:** Use the more standard-conform `strict` CSS_PROFILE to render the page. + (``
`` and ```` do not add whitespaces in the `strict` profile. Many text-based browsers such + as Lynx and ``inscript`` add whitespaces by default to reduce the likelihood of words getting glued together). -.. code-block:: python + .. code-block:: python + + from lxml.html import fromstring + + from inscriptis import Inscriptis + from inscriptis.css_profiles import CSS_PROFILES + from inscriptis.model.config import ParserConfig + + # create a ParserConfig that uses the strict CSS rendering profile + css = CSS_PROFILES['strict'] + config = ParserConfig(css=css) + + html_tree = fromstring(html) + parser = Inscriptis(html_tree, config) + text = parser.get_text() + +2. **Firefox-like whitespace handling and fine-tuning of link handling:** Use the strict profile + together with inline links and anchor URLs. + + .. code-block:: python + + from lxml.html import fromstring + + from inscriptis import Inscriptis + from inscriptis.css_profiles import CSS_PROFILES + from inscriptis.model.config import ParserConfig + + # use the strict CSS rendering profile and fine-tune link handling. + css = CSS_PROFILES['strict'] + config = ParserConfig(css=css, display_links=True, + display_anchors=True) + + html_tree = fromstring(html) + parser = Inscriptis(html_tree, config) + text = parser.get_text() + + +3. **Overwriting the default CSS definition:** inscriptis uses CSS definitions + that are maintained in ``inscriptis.css_profiles.CSS_PROFILES`` for + rendering HTML tags. You can override these definitions (and therefore + change the rendering) as outlined below: + + .. code-block:: python + + from lxml.html import fromstring + + from inscriptis import Inscriptis + from inscriptis.css_profiles import CSS_PROFILES + from inscriptis.html_properties import Display + from inscriptis.model.config import ParserConfig + from inscriptis.model.html_element import HtmlElement + + # Create a custom CSS based on the default style sheet and change the + # rendering of `div` and `span` elements. 
+ css = CSS_PROFILES['strict'].copy() + css['div'] = HtmlElement(display=Display.block, padding_inline=2) + css['span'] = HtmlElement(prefix=' ', suffix=' ') + + html_tree = fromstring(html) + # create a parser using a custom css + config = ParserConfig(css=css) + parser = Inscriptis(html_tree, config) + text = parser.get_text() + +4. **Ignore elements during parsing:** + Overwriting the default CSS profile also allows changing the rendering of selected elements. + The snippet below, for example, removes forms from the parsed text by setting the definition of the ``form`` tag to ``Display.none``. + + .. code-block:: python + + from inscriptis import get_text + from inscriptis.css_profiles import CSS_PROFILES, HtmlElement + from inscriptis.html_properties import Display + from inscriptis.model.config import ParserConfig + + # create a custom CSS based on the default style sheet and disable the + # rendering of `form` elements + css = CSS_PROFILES['strict'].copy() + css['form'] = HtmlElement(display=Display.none) + + # create a parser configuration using a custom css + html = """First line. +
+ User data +
+
+
+ +
""" + config = ParserConfig(css=css) + text = get_text(html, config) + print(text) - from lxml.html import fromstring - from inscriptis.css_profiles import CSS_PROFILES, HtmlElement - from inscriptis.html_properties import Display - from inscriptis.model.config import ParserConfig - - # create a custom CSS based on the default style sheet and change the - # rendering of `div` and `span` elements - css = CSS_PROFILES['strict'].copy() - css['div'] = HtmlElement(display=Display.block, padding=2) - css['span'] = HtmlElement(prefix=' ', suffix=' ') - - html_tree = fromstring(html) - # create a parser using a custom css - config = ParserConfig(css=css) - parser = Inscriptis(html_tree, config) - text = parser.get_text() Custom HTML tag handling @@ -570,55 +637,6 @@ The following code mitigates this problem on Unix systems by manually forcing lx return libc.malloc_trim(0) -Examples -======== - -Strict indentation handling ---------------------------- - -The following example demonstrates modifying ``ParserConfig`` for strict indentation handling. - -.. code-block:: python - - from inscriptis import get_text - from inscriptis.css_profiles import CSS_PROFILES - from inscriptis.model.config import ParserConfig - - config = ParserConfig(css=CSS_PROFILES['strict'].copy()) - text = get_text('first', config) - print(text) - -Ignore elements during parsing ------------------------------- - -Overwriting the default CSS profile also allows changing the rendering of selected elements. -The snippet below, for example, removes forms from the parsed text by setting the definition of the ``form`` tag to ``Display.none``. - -.. 
code-block:: python - - from inscriptis import get_text - from inscriptis.css_profiles import CSS_PROFILES, HtmlElement - from inscriptis.html_properties import Display - from inscriptis.model.config import ParserConfig - - # create a custom CSS based on the default style sheet and change the - # rendering of `div` and `span` elements - css = CSS_PROFILES['strict'].copy() - css['form'] = HtmlElement(display=Display.none) - - # create a parser configuration using a custom css - html = """First line. -
- User data -
-
-
- -
""" - config = ParserConfig(css=css) - text = get_text(html, config) - print(text) - Citation ======== diff --git a/docs/index.rst b/docs/index.rst index 917b12a..b2f88de 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -16,7 +16,7 @@ Contents: Documentation benchmarking contributing - inscriptis-module-documentation + api diff --git a/docs/inscriptis-module-documentation.rst b/docs/inscriptis-module-documentation.rst deleted file mode 100644 index b25cb8f..0000000 --- a/docs/inscriptis-module-documentation.rst +++ /dev/null @@ -1,58 +0,0 @@ -=============================== -Inscriptis module documentation -=============================== - -.. automodule:: inscriptis - :members: - -Inscriptis model -================ - -Inscriptis HTML engine ----------------------- -.. automodule:: inscriptis.html_engine - :members: - -Inscriptis HTML properties --------------------------- -.. automodule:: inscriptis.html_properties - :members: - -Inscriptis CSS model --------------------- -.. automodule:: inscriptis.model.css - :members: - -Inscriptis canvas model ------------------------ -.. automodule:: inscriptis.model.canvas - :members: - -.. automodule:: inscriptis.model.canvas.block - :members: - -.. automodule:: inscriptis.model.canvas.prefix - :members: - - - -Inscriptis table model ----------------------- -.. automodule:: inscriptis.model.table - :members: - - -.. _annotations: - -Inscriptis annotations -====================== - -.. automodule:: inscriptis.annotation - :members: - - -Annotation processors ---------------------- - -.. 
automodule:: inscriptis.annotation.output - :members: diff --git a/pyproject.toml b/pyproject.toml index f60795f..650ec43 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,7 @@ uvicorn = { version = "^0.34.0", optional = true } [tool.poetry.group.dev.dependencies] pytest = "^8.3.5" +myst_parser = "^4.0.1" [build-system] diff --git a/src/inscriptis/__init__.py b/src/inscriptis/__init__.py index 4e52312..b5dcda9 100644 --- a/src/inscriptis/__init__.py +++ b/src/inscriptis/__init__.py @@ -7,7 +7,7 @@ - annotations The following example provides the text representation of -``_. +``_ using the method :meth:`inscriptis.get_text`. .. code:: @@ -21,7 +21,7 @@ print(text) -Use the method :meth:`~inscriptis.get_annotated_text` to obtain text and +Use the method :meth:`inscriptis.get_annotated_text` to obtain text and annotations. The method requires annotation rules as described in annotations_. .. code:: @@ -116,7 +116,7 @@ def get_annotated_text( Notes: - the text is stored under the key 'text'. - annotations are provided under the key 'label' which contains a - list of :class:`Annotation`s. + list of Annotations. 
Examples: {"text": "EU rejects German call to boycott British lamb.", " diff --git a/src/inscriptis/annotation/output/surface.py b/src/inscriptis/annotation/output/surface.py index e4e5252..c80a078 100644 --- a/src/inscriptis/annotation/output/surface.py +++ b/src/inscriptis/annotation/output/surface.py @@ -1,4 +1,5 @@ """Surface Form Annotation Processor.""" + from typing import Dict, Any from inscriptis.annotation.output import AnnotationProcessor diff --git a/src/inscriptis/annotation/parser.py b/src/inscriptis/annotation/parser.py index a246aee..471b8ba 100644 --- a/src/inscriptis/annotation/parser.py +++ b/src/inscriptis/annotation/parser.py @@ -16,6 +16,7 @@ "#class=short-description]": ["description"] } """ + from collections import defaultdict from copy import copy from typing import Dict, Tuple, List diff --git a/src/inscriptis/css_profiles.py b/src/inscriptis/css_profiles.py index 6f680e8..8177ae6 100644 --- a/src/inscriptis/css_profiles.py +++ b/src/inscriptis/css_profiles.py @@ -1,16 +1,16 @@ #!/usr/bin/env python3 # coding: utf-8 -"""Standard CSS profiles shipped with inscriptis. +"""Standard CSS profiles shipped with Inscriptis. -- `strict`: this profile corresponds to the defaults used by Firefox -- `relaxed`: this profile is more suited for text analytics, since it ensures - that whitespaces are inserted between span and div elements - preventing cases where two words stick together. + CSS profiles are used together with + :class:`inscriptis.model.config.ParserConfig` to customize + the HTML to text conversion. 
""" from inscriptis.html_properties import Display, WhiteSpace from inscriptis.model.html_element import HtmlElement +#: A CSS profile that corresponds to the defaults used by the Firefox Browser STRICT_CSS_PROFILE = { "body": HtmlElement(display=Display.inline, whitespace=WhiteSpace.normal), "head": HtmlElement(display=Display.none), @@ -54,6 +54,7 @@ "plaintext": HtmlElement(display=Display.block, whitespace=WhiteSpace.pre), } +#: A relaxed CSS profile optimized for content extraction and text analytics. RELAXED_CSS_PROFILE = STRICT_CSS_PROFILE.copy() RELAXED_CSS_PROFILE["div"] = HtmlElement(display=Display.block, padding_inline=2) RELAXED_CSS_PROFILE["span"] = HtmlElement( diff --git a/src/inscriptis/model/canvas/block.py b/src/inscriptis/model/canvas/block.py index 5013233..1e2db8a 100644 --- a/src/inscriptis/model/canvas/block.py +++ b/src/inscriptis/model/canvas/block.py @@ -1,4 +1,5 @@ """Representation of a text block within the HTML canvas.""" + from __future__ import annotations from html import unescape diff --git a/src/inscriptis/model/config.py b/src/inscriptis/model/config.py index fe06897..d13f68f 100644 --- a/src/inscriptis/model/config.py +++ b/src/inscriptis/model/config.py @@ -1,5 +1,7 @@ #!/usr/bin/env python -"""Provide configuration objects for the Inscriptis HTML to text converter.""" +"""This module provides the ParserConfig class which allows customizing + how Inscriptis converts HTML to text. +""" from __future__ import annotations from copy import deepcopy @@ -15,7 +17,45 @@ class ParserConfig: - """Encapsulate configuration options and CSS definitions.""" + """The ParserConfig class encapsulates + + - CSS definitions (from :mod:`inscriptis.css_profiles` or custom + definitions). + - configuration options for handling images, captions, links, etc. + - annotation rules, if Inscriptis is used for annotating text. + - custom html tag handlers. + + Attributes: + css: An optional custom CSS definition. 
+ display_images: Whether to include image titles/alt texts. + deduplicate_captions: Whether to deduplicate captions such as image + titles (many newspapers include images and video previews with + identical titles). + display_links: Whether to display link targets + (e.g. `[Python](https://www.python.org)`). + display_anchors: Whether to display anchors (e.g. `[here](#here)`). + annotation_rules: An optional dictionary of annotation rules which + specify tags and attributes to annotate. + table_cell_separator: Separator to use between table cells. + custom_html_tag_handler_mapping: An optional CustomHtmlTagHandler. + + + The following example demonstrates how ParserConfig is used to + + - enable the strict CSS profile and + - prevent links from being shown. + + .. code-block:: Python + + from inscriptis import get_text + from inscriptis.css_profiles import CSS_PROFILES + from inscriptis.model.config import ParserConfig + + css_profile = CSS_PROFILES['strict'].copy() + config = ParserConfig(css=css_profile, display_links=False) + text = get_text('first link', config) + print(text) + """ def __init__( self, @@ -65,7 +105,7 @@ def __init__( def parse_a(self) -> bool: """Indicate whether the text output should contain links or anchors. - Returns + Returns: Whether we need to parse tags. """ return self.display_links or self.display_anchors diff --git a/src/inscriptis/model/css.py b/src/inscriptis/model/css.py index f52c41b..779a20b 100644 --- a/src/inscriptis/model/css.py +++ b/src/inscriptis/model/css.py @@ -5,6 +5,7 @@ - :class:`CssParse` parses CSS specifications and translates them into the corresponding HtmlElements used by Inscriptis for rendering HTML pages. 
""" + from contextlib import suppress from re import compile as re_compile diff --git a/src/inscriptis/model/html_element.py b/src/inscriptis/model/html_element.py index 5c72bbc..4682e04 100644 --- a/src/inscriptis/model/html_element.py +++ b/src/inscriptis/model/html_element.py @@ -1,4 +1,12 @@ -"""Data structures for handling HTML Elements.""" +""" +The HtmlElement class controls how Inscriptis interprets HTML Elements. + +- The module :mod:`inscriptis.css_profiles` contains CSS profiles which assign + to each standard HTML tag the corresponding :class:`HtmlElement`. +- As for standard GUI browsers, CSS definitions within the parsed HTML modify + the :class:`HtmlElement` and its interpretation. +""" + from typing import Tuple from inscriptis.html_properties import ( @@ -13,23 +21,23 @@ class HtmlElement: """The HtmlElement class stores properties and metadata of HTML elements. Attributes: - - canvas: the canvas to which the HtmlElement writes its content. - - tag: tag name of the given HtmlElement. - - prefix: specifies a prefix that to insert before the tag's content. - - suffix: a suffix to append after the tag's content. - - display: :class:`~inscriptis.html_properties.Display` strategy used for - the content. - - margin_before: vertical margin before the tag's content. - - margin_after: vertical margin after the tag's content. - - padding_inline: horizontal padding_inline before the tag's content. - - whitespace: the :class:`~inscriptis.html_properties.Whitespace` handling - strategy. - - limit_whitespace_affixes: limit printing of whitespace affixes to - elements with `normal` whitespace handling. - - align: the element's horizontal alignment. - - valign: the element's vertical alignment. - - previous_margin_after: the margin after of the previous HtmlElement. - - annotation: annotations associated with the HtmlElement. + canvas: the canvas to which the HtmlElement writes its content. + tag: tag name of the given HtmlElement. 
+ prefix: specifies a prefix to insert before the tag's content. + suffix: a suffix to append after the tag's content. + display: :class:`~inscriptis.html_properties.Display` strategy used for + the content. + margin_before: vertical margin before the tag's content. + margin_after: vertical margin after the tag's content. + padding_inline: horizontal padding_inline before the tag's content. + whitespace: the :class:`~inscriptis.html_properties.WhiteSpace` handling + strategy. + limit_whitespace_affixes: limit printing of whitespace affixes to + elements with `normal` whitespace handling. + align: the element's horizontal alignment. + valign: the element's vertical alignment. + previous_margin_after: the margin after of the previous HtmlElement. + annotation: annotations associated with the HtmlElement. """ __slots__ = ( diff --git a/src/inscriptis/model/tag/__init__.py b/src/inscriptis/model/tag/__init__.py index e877f80..e02d3ae 100644 --- a/src/inscriptis/model/tag/__init__.py +++ b/src/inscriptis/model/tag/__init__.py @@ -1,4 +1,5 @@ """HTML Tag handlers and classes for designing custom HTML tag handlers.""" + from __future__ import annotations from typing import Dict, Callable, NamedTuple diff --git a/src/inscriptis/model/tag/a_tag.py b/src/inscriptis/model/tag/a_tag.py index f435377..4f3e631 100644 --- a/src/inscriptis/model/tag/a_tag.py +++ b/src/inscriptis/model/tag/a_tag.py @@ -1,4 +1,5 @@ """Handle the tag.""" + from typing import Dict from inscriptis.model.html_document_state import HtmlDocumentState diff --git a/src/inscriptis/model/tag/br_tag.py b/src/inscriptis/model/tag/br_tag.py index 6a354d1..b10bdd2 100644 --- a/src/inscriptis/model/tag/br_tag.py +++ b/src/inscriptis/model/tag/br_tag.py @@ -1,4 +1,5 @@ """Handle the 
tag.""" + from typing import Dict from inscriptis.model.html_document_state import HtmlDocumentState diff --git a/src/inscriptis/model/tag/img_tag.py b/src/inscriptis/model/tag/img_tag.py index 51848af..01649ba 100644 --- a/src/inscriptis/model/tag/img_tag.py +++ b/src/inscriptis/model/tag/img_tag.py @@ -1,4 +1,5 @@ """Handle the tag.""" + from typing import Dict from inscriptis.model.html_document_state import HtmlDocumentState diff --git a/src/inscriptis/model/tag/list_tag.py b/src/inscriptis/model/tag/list_tag.py index 08fc553..04e6731 100644 --- a/src/inscriptis/model/tag/list_tag.py +++ b/src/inscriptis/model/tag/list_tag.py @@ -1,4 +1,5 @@ """Handle the
  • ,
      ,