# Source code for inscriptis

r"""Parse HTML content and convert it into a text representation.

Inscriptis provides support for

 - nested HTML tables
 - basic Cascading Style Sheets
 - annotations

The following example provides the text representation of
`<https://www.fhgr.ch>`_.

.. code::

   import urllib.request
   from inscriptis import get_text

   url = 'https://www.fhgr.ch'
   html = urllib.request.urlopen(url).read().decode('utf-8')

   text = get_text(html)

   print(text)

Use the method :meth:`~inscriptis.get_annotated_text` to obtain text and
annotations. The method requires annotation rules as described in annotations_.

.. code::

   import urllib.request
   from inscriptis import get_annotated_text
   from inscriptis.model.config import ParserConfig

   url = "https://www.fhgr.ch"
   html = urllib.request.urlopen(url).read().decode('utf-8')

   # annotation rules specify the HTML elements and attributes to annotate.
   rules = {'h1': ['heading'],
            'h2': ['heading'],
            '#class=FactBox': ['fact-box'],
            'i': ['emphasis']}

   output = get_annotated_text(html, ParserConfig(annotation_rules=rules))
   print("Text:", output['text'])
   print("Annotations:", output['label'])

The method returns a dictionary with two keys:

 1. `text` which contains the page's plain text and
 2. `label` with the annotations in JSONL format that is used by annotators
    such as `doccano <https://doccano.herokuapp.com/>`_.

Annotations in the `label` field are returned as a list of triples with
`start index`, `end index` and `label` as indicated below:

.. code-block:: json

   {"text": "Chur\n\nChur is the capital and largest town of the Swiss canton
             of the Grisons and lies in the Grisonian Rhine Valley.",
    "label": [[0, 4, "heading"], [6, 10, "emphasis"]]}

"""

import re
import lxml.html

from typing import Dict, Optional, Any

from inscriptis.model.config import ParserConfig
from inscriptis.html_engine import Inscriptis

# Matches an XML declaration (``<?xml ... ?>``) at the very start of a string.
# It is used to drop that declaration from HTML content before the content is
# passed to the lxml HTML parser.
RE_STRIP_XML_DECLARATION = re.compile(r'^<\?xml [^>]+?\?>')


def _get_html_tree(html_content: str) -> Optional[lxml.html.HtmlElement]:
    """Obtain the HTML parse tree for the given HTML content.

    Leading and trailing whitespace is removed and a leading XML declaration
    (``<?xml ... ?>``) is stripped before the content is parsed.

    Args:
        html_content: The content to parse.

    Returns:
        The corresponding HTML parse tree, or ``None`` if there is nothing to
        parse, i.e. the content is empty, consists of whitespace only, or
        contains nothing but an XML declaration.
    """
    html_content = html_content.strip()
    if not html_content:
        return None

    # strip XML declaration, if necessary
    if html_content.startswith('<?xml '):
        html_content = RE_STRIP_XML_DECLARATION.sub('', html_content, count=1)
        # A document that consisted solely of the XML declaration is now
        # empty. The parser rejects empty documents, so handle this case like
        # any other empty input instead of letting the parser raise.
        if not html_content:
            return None

    return lxml.html.fromstring(html_content)


def get_text(html_content: str,
             config: Optional[ParserConfig] = None) -> str:
    """Provide a text representation of the given HTML content.

    Args:
        html_content: The HTML content to convert.
        config: An optional ParserConfig object. It is handed on unchanged to
            :class:`Inscriptis`; ``None`` is the default.

    Returns:
        The text representation of the HTML content, or an empty string if
        no parse tree could be obtained for `html_content` (for instance,
        because it is empty).
    """
    html_tree = _get_html_tree(html_content)
    if html_tree is None:
        # nothing to convert
        return ''
    return Inscriptis(html_tree, config).get_text()
def get_annotated_text(
        html_content: str,
        config: Optional[ParserConfig] = None) -> Dict[str, Any]:
    """Return a dictionary of the extracted text and annotations.

    Notes:
        - the text is stored under the key 'text'.
        - annotations are provided under the key 'label' which contains a
          list of ``(start, end, metadata)`` tuples, one per annotation
          reported by :class:`Inscriptis`.
        - if no parse tree can be obtained for `html_content` (for instance,
          because it is empty), an empty dictionary is returned, i.e.
          neither of the two keys is present.

    Examples:
        {"text": "EU rejects German call to boycott British lamb.",
         "label": [ [0, 2, "strong"], ... ]}
        {"text": "Peter Blackburn", "label": [ [0, 15, "heading"] ]}

    Args:
        html_content: The HTML content to convert.
        config: An optional ParserConfig object. It is handed on unchanged to
            :class:`Inscriptis`; ``None`` is the default.

    Returns:
        A dictionary of text (key: 'text') and annotations (key: 'label'),
        or an empty dictionary if there is no content to process.
    """
    html_tree = _get_html_tree(html_content)
    if html_tree is None:
        return {}

    inscriptis = Inscriptis(html_tree, config)
    # Flatten each annotation into a plain (start, end, metadata) triple.
    labels = [(a.start, a.end, a.metadata)
              for a in inscriptis.get_annotations()]
    return {'text': inscriptis.get_text(), 'label': labels}