# Source code for inscriptis.html_engine

#!/usr/bin/env python
"""The HTML Engine is responsible for converting HTML to text."""

from __future__ import annotations

from typing import TYPE_CHECKING

from lxml.etree import Comment

from inscriptis.model.config import ParserConfig
from inscriptis.model.html_document_state import HtmlDocumentState
from inscriptis.model.tag.a_tag import a_end_handler, a_start_handler
from inscriptis.model.tag.br_tag import br_start_handler
from inscriptis.model.tag.img_tag import img_start_handler
from inscriptis.model.tag.list_tag import (
    li_start_handler,
    ol_end_handler,
    ol_start_handler,
    ul_end_handler,
    ul_start_handler,
)
from inscriptis.model.tag.table_tag import (
    table_end_handler,
    table_start_handler,
    td_end_handler,
    td_start_handler,
    tr_start_handler,
)

if TYPE_CHECKING:
    from collections.abc import Callable

    import lxml.html

    from inscriptis.annotation import Annotation
    from inscriptis.model.canvas import Canvas


class Inscriptis:
    """Translate an lxml HTML tree to the corresponding text representation.

    The conversion runs eagerly inside the constructor: the tree is walked
    once and the resulting canvas is stored in ``self.canvas``, which
    :meth:`get_text` and :meth:`get_annotations` then query.

    Args:
        html_tree: the lxml HTML tree to convert.
        config: an optional ParserConfig configuration object.

    Example::

        from lxml.html import fromstring
        from inscriptis.html_engine import Inscriptis

        html_content = "<html><body><h1>Test</h1></body></html>"

        # create an HTML tree from the HTML content.
        html_tree = fromstring(html_content)

        # transform the HTML tree to text.
        parser = Inscriptis(html_tree)
        text = parser.get_text()
    """

    def __init__(
        self,
        html_tree: lxml.html.HtmlElement,
        config: ParserConfig | None = None,
    ) -> None:
        """Build the tag handler tables and convert ``html_tree``.

        Args:
            html_tree: the lxml HTML tree to convert.
            config: an optional ParserConfig configuration object; a default
                ``ParserConfig()`` is created when none is provided.
        """
        # use the default configuration, if no config object is provided
        config = config or ParserConfig()

        # setup start and end tag call tables.
        # A ``None`` value marks a tag whose handler has been disabled by the
        # configuration; such entries are skipped while walking the tree.
        self.start_tag_handler_dict: dict[
            str, Callable[[HtmlDocumentState, dict], None] | None
        ] = {
            "table": table_start_handler,
            "tr": tr_start_handler,
            "td": td_start_handler,
            "th": td_start_handler,
            "ul": ul_start_handler,
            "ol": ol_start_handler,
            "li": li_start_handler,
            "br": br_start_handler,
            "a": a_start_handler if config.parse_a() else None,
            "img": img_start_handler if config.display_images else None,
        }
        self.end_tag_handler_dict: dict[
            str, Callable[[HtmlDocumentState], None] | None
        ] = {
            "table": table_end_handler,
            "ul": ul_end_handler,
            "ol": ol_end_handler,
            "td": td_end_handler,
            "th": td_end_handler,
            "a": a_end_handler if config.parse_a() else None,
        }

        # custom handlers are merged last, so they add to or override the
        # default entries registered above
        if config.custom_html_tag_handler_mapping:
            self.start_tag_handler_dict.update(
                config.custom_html_tag_handler_mapping.start_tag_mapping
            )
            self.end_tag_handler_dict.update(
                config.custom_html_tag_handler_mapping.end_tag_mapping
            )

        # parse the HTML tree
        self.canvas = self._parse_html_tree(HtmlDocumentState(config), html_tree)

    def _parse_html_tree(self, state: HtmlDocumentState, tree) -> Canvas:
        """Parse the HTML tree.

        The method recurses once per child node, i.e. the recursion depth
        equals the nesting depth of the tree.

        Args:
            state: the current HTML document state.
            tree: the HTML tree to parse.

        Returns:
            The canvas of ``state`` (``state.canvas``).
        """
        if isinstance(tree.tag, str):
            # NOTE(review): presumably this pushes the state of the new tag
            # onto ``state.tags`` (it is read via ``state.tags[-1]`` right
            # below and popped once the children are done) - confirm.
            state.apply_starttag_layout(tree.tag, tree.attrib)

            # handle the starttag; missing or disabled (None) handlers are
            # skipped
            if handler := self.start_tag_handler_dict.get(tree.tag):
                handler(state, tree.attrib)
            cur = state.tags[-1]
            cur.canvas.open_tag(cur)

            # ``state.tags[-1]`` is looked up again on purpose rather than
            # reusing ``cur``, exactly as the traversal has always done
            state.tags[-1].write(tree.text)

            for node in tree:
                self._parse_html_tree(state, node)

            # handle the endtag
            if handler := self.end_tag_handler_dict.get(tree.tag):
                handler(state)
            prev = state.tags.pop()
            prev.canvas.close_tag(prev)

            # write the tail text to the element's container
            state.tags[-1].write(tree.tail)

        elif tree.tag is Comment and tree.tail:
            # a comment contributes no text itself, but the text following it
            # (its tail) belongs to the enclosing element.
            # NOTE(review): tails of other non-element nodes (anything whose
            # tag is neither a str nor ``Comment``) are not written - confirm
            # that this is intended.
            state.tags[-1].canvas.write(state.tags[-1], tree.tail)

        return state.canvas

    def get_text(self) -> str:
        """Return the text extracted from the HTML page."""
        return self.canvas.get_text()

    def get_annotations(self) -> list[Annotation]:
        """Return the annotations extracted from the HTML page."""
        return self.canvas.annotations