# Source code for inscriptis.model.canvas

#!/usr/bin/env python
# encoding: utf-8

"""Classes used for rendering (parts) of the canvas.

Every parsed :class:`~inscriptis.model.html_element.HtmlElement` writes its
textual content to the canvas which is managed by the following three classes:

  - :class:`Canvas` provides the drawing board on which the HTML page is
    serialized and annotations are recorded.
  - :class:`~inscriptis.model.canvas.block.Block` contains the current line to
    which text is written.
  - :class:`~inscriptis.model.canvas.prefix.Prefix` handles indentation
    and bullets that prefix a line.
"""

from inscriptis.annotation import Annotation
from inscriptis.html_properties import WhiteSpace, Display
from inscriptis.model.canvas.block import Block
from inscriptis.model.html_element import HtmlElement
from inscriptis.model.canvas.prefix import Prefix


class Canvas:
    r"""The text Canvas on which Inscriptis writes the HTML page.

    Attributes:
        margin: the current margin to the previous block (this is required to
            ensure that the `margin_after` and `margin_before` constraints of
            HTML block elements are met).
        current_block: A :class:`~inscriptis.model.canvas.block.Block` which
            merges the input text into a block (i.e., line).
        blocks: a list of strings containing the completed blocks (i.e.,
            text lines). Blocks are joined with newlines by :meth:`get_text`,
            so each block contributes at least one line to the output.
        annotations: the list of recorded
            :class:`~inscriptis.annotation.Annotation`\s.
        _open_annotations: maps every open tag that carries annotations to
            the value of ``current_block.idx`` at the time it was opened.
    """

    __slots__ = ('annotations', 'blocks', 'current_block',
                 '_open_annotations', 'margin')

    def __init__(self):
        # Margin to the previous block.  It starts out larger than any
        # realistic block margin so that no margin newlines are emitted
        # before the first content is written; flushing content resets
        # it to 0.
        self.margin = 1000
        self.current_block = Block(0, Prefix())
        self.blocks = []
        self.annotations = []
        self._open_annotations = {}

    def open_tag(self, tag: HtmlElement) -> None:
        """Register that a tag is opened.

        Args:
            tag: the tag to open.
        """
        if tag.annotation:
            # remember where the annotated content starts
            self._open_annotations[tag] = self.current_block.idx

        if tag.display == Display.block:
            self.open_block(tag)

    def open_block(self, tag: HtmlElement):
        """Open an HTML block element.

        Args:
            tag: the HTML block element to open.
        """
        # write missing bullets, if no content has been written
        if not self._flush_inline() and tag.list_bullet:
            self.write_unconsumed_bullet()
        self.current_block.prefix.register_prefix(tag.padding_inline,
                                                  tag.list_bullet)

        # write the block margin
        self._write_margin(max(tag.previous_margin_after, tag.margin_before))

    def write_unconsumed_bullet(self):
        """Write unconsumed bullets to the blocks list."""
        bullet = self.current_block.prefix.unconsumed_bullet
        if bullet:
            self.blocks.append(bullet)
            self.current_block.idx += len(bullet)
            self.current_block = self.current_block.new_block()
            self.margin = 0

    def write(self, tag: HtmlElement, text: str,
              whitespace: WhiteSpace = None) -> None:
        """Write the given text to the current block.

        Args:
            tag: the tag the text belongs to.
            text: the text to merge into the current block.
            whitespace: optional whitespace handling to use for this text;
                if it is not given (or falsy), ``tag.whitespace`` is used.
        """
        self.current_block.merge(text, whitespace or tag.whitespace)

    def close_tag(self, tag: HtmlElement) -> None:
        """Register that the given tag is closed.

        Args:
            tag: the tag to close.
        """
        if tag.display == Display.block:
            # write missing bullets, if no content has been written so far.
            if not self._flush_inline() and tag.list_bullet:
                self.write_unconsumed_bullet()
            self.current_block.prefix.remove_last_prefix()
            self.close_block(tag)

        if tag in self._open_annotations:
            start_idx = self._open_annotations.pop(tag)
            end_idx = self.current_block.idx
            # do not record annotations with no content
            if start_idx == end_idx:
                return

            self.annotations.extend(
                Annotation(start_idx, end_idx, annotation)
                for annotation in tag.annotation)

    def close_block(self, tag: HtmlElement):
        """Close the given HtmlElement by writing its bottom margin.

        Args:
            tag: the HTML Block element to close
        """
        self._write_margin(tag.margin_after)

    def write_newline(self):
        """Terminate the current line.

        Pending inline content is flushed into ``blocks``.  If there is
        nothing to flush, an empty block is appended instead and a new
        current block is started.
        """
        if not self._flush_inline():
            self.blocks.append('')
            self.current_block = self.current_block.new_block()

    def get_text(self) -> str:
        """Provide a text representation of the Canvas."""
        self._flush_inline()
        return '\n'.join(self.blocks)

    def _write_margin(self, required_margin: int) -> None:
        """Extend the margin to the previous block to ``required_margin``.

        Nothing is written if the current margin already satisfies the
        requirement.

        Args:
            required_margin: the margin that must be present.
        """
        if required_margin > self.margin:
            required_newlines = required_margin - self.margin
            self.current_block.idx += required_newlines
            # joining the blocks in get_text() contributes one further
            # newline, hence one less is stored in the block itself
            self.blocks.append('\n' * (required_newlines - 1))
            self.margin = required_margin

    def _flush_inline(self) -> bool:
        """Attempt to flush the content in self.current_block into a new block.

        Notes:
            - If self.current_block does not contain any content (or only
              whitespaces) no changes are made.
            - Otherwise the content of current_block is added to blocks and a
              new current_block is initialized.

        Returns:
            True if the attempt was successful, False otherwise.
        """
        if not self.current_block.is_empty():
            self.blocks.append(self.current_block.content)
            self.current_block = self.current_block.new_block()
            self.margin = 0
            return True

        return False

    @property
    def left_margin(self) -> int:
        """Return the length of the current line's left margin."""
        return self.current_block.prefix.current_padding