Source code for inscriptis.html_engine

#!/usr/bin/env python
"""The HTML Engine is responsible for converting HTML to text."""

from __future__ import annotations

from typing import TYPE_CHECKING

from lxml.etree import Comment

from inscriptis.model.config import ParserConfig
from inscriptis.model.html_document_state import HtmlDocumentState
from inscriptis.model.tag.a_tag import a_end_handler, a_start_handler
from inscriptis.model.tag.br_tag import br_start_handler
from inscriptis.model.tag.img_tag import img_start_handler
from inscriptis.model.tag.list_tag import (
    li_start_handler,
    ol_end_handler,
    ol_start_handler,
    ul_end_handler,
    ul_start_handler,
)
from inscriptis.model.tag.table_tag import (
    table_end_handler,
    table_start_handler,
    td_end_handler,
    td_start_handler,
    tr_start_handler,
)

if TYPE_CHECKING:
    from collections.abc import Callable

    import lxml.html

    from inscriptis.annotation import Annotation
    from inscriptis.model.canvas import Canvas



[docs]
class Inscriptis:
    """Translate an lxml HTML tree to the corresponding text representation.

    The conversion runs once, inside the constructor. The resulting canvas is
    stored in ``self.canvas`` and queried through :meth:`get_text` and
    :meth:`get_annotations`.

    Args:
      html_tree: the lxml HTML tree to convert.
      config: an optional ParserConfig configuration object.

    Example::

      from lxml.html import fromstring
      from inscriptis.html_engine import Inscriptis

      html_content = "<html><body><h1>Test</h1></body></html>"

      # create an HTML tree from the HTML content.
      html_tree = fromstring(html_content)

      # transform the HTML tree to text.
      parser = Inscriptis(html_tree)
      text = parser.get_text()

    """

    def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig | None = None) -> None:
        # use the default configuration, if no config object is provided
        config = config or ParserConfig()

        # the same decision drives both the start and the end handler of <a>,
        # so evaluate it once.
        parse_links = config.parse_a()

        # setup start and end tag call tables; a ``None`` entry disables the
        # tag, because the lookups in ``_parse_html_tree`` skip falsy handlers.
        self.start_tag_handler_dict: dict[str, Callable[[HtmlDocumentState, dict], None] | None] = {
            "table": table_start_handler,
            "tr": tr_start_handler,
            "td": td_start_handler,
            "th": td_start_handler,
            "ul": ul_start_handler,
            "ol": ol_start_handler,
            "li": li_start_handler,
            "br": br_start_handler,
            "a": a_start_handler if parse_links else None,
            "img": img_start_handler if config.display_images else None,
        }
        self.end_tag_handler_dict: dict[str, Callable[[HtmlDocumentState], None] | None] = {
            "table": table_end_handler,
            "ul": ul_end_handler,
            "ol": ol_end_handler,
            "td": td_end_handler,
            "th": td_end_handler,
            "a": a_end_handler if parse_links else None,
        }

        # custom handlers are applied last, so they replace the built-in
        # handler registered for the same tag name.
        if config.custom_html_tag_handler_mapping:
            self.start_tag_handler_dict.update(config.custom_html_tag_handler_mapping.start_tag_mapping)
            self.end_tag_handler_dict.update(config.custom_html_tag_handler_mapping.end_tag_mapping)

        # parse the HTML tree
        self.canvas = self._parse_html_tree(HtmlDocumentState(config), html_tree)

    def _parse_html_tree(self, state: HtmlDocumentState, tree: lxml.html.HtmlElement) -> Canvas:
        """Parse the HTML tree.

        Nodes with a string tag are processed in document order: the start
        tag layout and handler are applied, the node's text is written, the
        children are parsed recursively, the end tag handler is applied and
        finally the node's tail text is written to the enclosing tag.
        For comment nodes only the tail text is written; any other node type
        is skipped.

        The method calls itself once per nesting level of the tree.

        Args:
            state: the current HTML document state.
            tree: the HTML tree to parse.

        Returns:
            The canvas of ``state``.

        """
        if isinstance(tree.tag, str):
            state.apply_starttag_layout(tree.tag, tree.attrib)

            if handler := self.start_tag_handler_dict.get(tree.tag):
                handler(state, tree.attrib)
            cur = state.tags[-1]
            cur.canvas.open_tag(cur)

            state.tags[-1].write(tree.text)

            for node in tree:
                self._parse_html_tree(state, node)

            # handle the endtag
            if handler := self.end_tag_handler_dict.get(tree.tag):
                handler(state)
            prev = state.tags.pop()
            prev.canvas.close_tag(prev)

            # write the tail text to the element's container
            state.tags[-1].write(tree.tail)

        elif tree.tag is Comment and tree.tail:
            # the comment itself produces no output, but the text following
            # it belongs to the enclosing tag and must not be lost.
            state.tags[-1].canvas.write(state.tags[-1], tree.tail)

        return state.canvas

    def get_text(self) -> str:
        """Return the text extracted from the HTML page."""
        return self.canvas.get_text()

    def get_annotations(self) -> list[Annotation]:
        """Return the annotations extracted from the HTML page."""
        return self.canvas.annotations
Source code for inscriptis.html_engine

Navigation

Related Topics