Source code for docp_parsers.objects.pdfobject

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
:Purpose:   This module provides the 'PDF Document' object structure into
            which PDF documents are parsed for transport and onward
            use.

:Platform:  Linux/Windows | Python 3.11+
:Developer: J Berendt
:Email:     development@s3dev.uk

:Comments:  n/a

"""

try:
    from ._docbaseobject import _DocBase
    from ._pageobject import PageObject
except ImportError:
    from docp_parsers.objects._docbaseobject import _DocBase
    from docp_parsers.objects._pageobject import PageObject


class DocPDF(_DocBase):
    """Container object holding the data parsed from a single PDF file."""

    def __init__(self):
        """Initialise an empty PDF document object."""
        super().__init__()
        # Backing flag for the ``marked_content`` property.
        self._tags = False
        # Index 0 is occupied by a placeholder page, offsetting the list
        # by 1 so that each index aligns with its page number.
        placeholder = PageObject(pageno=0)
        self._pages = [placeholder]
        # Table objects extracted from the document.
        self._tables = []

    @property
    def pages(self) -> list[PageObject]:
        """A list containing one object for each page in the document.

        .. tip::

            The list index aligns with the page number in the PDF file.
            For example, the ``PageObject`` for page 42 is accessed
            using::

                pages[42]

        """
        return self._pages

    @property
    def marked_content(self) -> bool:
        """Flag indicating whether marked-content tags drove the parse.

        A PDF document can be created with 'marked content' tags. When
        this flag is set, the document was parsed using those tags and
        the parser respected columns and other page formatting schemes.

        Without tags, a multi-column page is read straight across each
        line, which corrupts the extracted text.

        """
        return self._tags

    @property
    def tables(self) -> list:
        """Accessor to the data extracted from the document's tables."""
        return self._tables