Source code for inscriptis.model.config
#!/usr/bin/env python
"""Configure Inscriptis HTML rendering."""
from __future__ import annotations
from copy import deepcopy
from typing import TYPE_CHECKING
from inscriptis.annotation.parser import AnnotationModel
from inscriptis.css_profiles import CSS_PROFILES
from inscriptis.model.attribute import Attribute
DEFAULT_CSS_PROFILE_NAME = "relaxed"
if TYPE_CHECKING:
from inscriptis.model.html_element import HtmlElement
from inscriptis.model.tag import CustomHtmlTagHandlerMapping
[docs]
class ParserConfig:
    """The ParserConfig class allows fine-tuning the HTML rendering.

    - CSS definitions (from :mod:`inscriptis.css_profiles` or custom
      definitions).
    - configuration options for handling images, captions, links, etc.
    - annotation rules, if Inscriptis is used for annotating text.
    - custom html tag handlers.

    Attributes:
        css: The CSS definition in use: the custom definition passed to the
            constructor or, if none was given, the default CSS profile. When
            annotation rules are supplied, this is the CSS produced by the
            annotation model instead.
        display_images: Whether to include image titles/alt texts.
        deduplicate_captions: Whether to deduplicate captions such as image
            titles (many newspaper include images and video previews with
            identical titles).
        display_links: Whether to display link targets
            (e.g. `[Python](https://www.python.org)`).
        display_anchors: Whether to display anchors (e.g. `[here](#here)`).
        attribute_handler: The :class:`Attribute` handler; extended with the
            annotation model's attribute map if annotation rules are given.
        table_cell_separator: Separator to use between table cells.
        custom_html_tag_handler_mapping: An optional
            CustomHtmlTagHandlerMapping.

    The following example demonstrates how ParserConfig is used to

    - enable the strict CSS profile and
    - prevent links from being shown.

    .. code-block:: Python

        from inscriptis import get_text
        from inscriptis.css_profiles import CSS_PROFILES
        from inscriptis.model.config import ParserConfig

        css_profile = CSS_PROFILES['strict'].copy()
        config = ParserConfig(css=css_profile, display_links=False)
        text = get_text('fi<span>r</span>st <a href="/first">link</a>', config)
        print(text)
    """

    def __init__(
        self,
        css: dict[str, HtmlElement] | None = None,
        display_images: bool = False,
        deduplicate_captions: bool = False,
        display_links: bool = False,
        display_anchors: bool = False,
        annotation_rules: dict[str, list[str]] | None = None,
        table_cell_separator: str = " ",
        custom_html_tag_handler_mapping: CustomHtmlTagHandlerMapping | None = None,
    ) -> None:
        """Create a ParserConfig configuration.

        Args:
            css: an optional custom CSS definition.
            display_images: whether to include image titles/alt texts.
            deduplicate_captions: whether to deduplicate captions such as image
                titles (many newspaper include images and video previews with
                identical titles).
            display_links: whether to display link targets
                (e.g. `[Python](https://www.python.org)`).
            display_anchors: whether to display anchors (e.g. `[here](#here)`).
            annotation_rules: an optional dictionary of annotation rules which
                specify tags and attributes to annotate. The rules are not
                stored on the instance; they are only used to derive
                ``css`` and to extend ``attribute_handler``.
            table_cell_separator: separator to use between table cells.
            custom_html_tag_handler_mapping: an optional
                CustomHtmlTagHandlerMapping.
        """
        self.display_images = display_images
        self.deduplicate_captions = deduplicate_captions
        self.display_links = display_links
        self.display_anchors = display_anchors
        # A missing (None) or empty CSS definition selects the default profile.
        self.css = css or CSS_PROFILES[DEFAULT_CSS_PROFILE_NAME]
        self.attribute_handler = Attribute()
        self.table_cell_separator = table_cell_separator
        self.custom_html_tag_handler_mapping = custom_html_tag_handler_mapping

        if annotation_rules:
            # Work on a deep copy to ensure that we do not modify the
            # original CSS definition or its members.
            annotation_model = AnnotationModel(deepcopy(self.css), annotation_rules)
            # css with annotation support
            self.css = annotation_model.css
            # attribute handler with annotation support
            self.attribute_handler.merge_attribute_map(annotation_model.css_attr)

    def parse_a(self) -> bool:
        """Indicate whether the text output should contain links or anchors.

        Returns:
            True if links or anchors are to be displayed, i.e. whether we
            need to parse <a> tags. The result is always a real ``bool``,
            even if the flags were set to other truthy/falsy values.
        """
        # bool() keeps the annotated return type honest: a bare ``or`` would
        # hand back whichever operand decided the expression (e.g. None or 1).
        return bool(self.display_links or self.display_anchors)