Source code for docp_parsers.objects.pdfobject

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
:Purpose:   This module provides the 'PDF Document' object structure into
            which PDF documents are parsed into for transport and onward
            use.

:Platform:  Linux/Windows | Python 3.11+
:Developer: J Berendt
:Email:     development@s3dev.uk

:Comments:  n/a

"""

try:
    from ._docbaseobject import _DocBase
    from ._pageobject import PageObject
except ImportError:
    from docp_parsers.objects._docbaseobject import _DocBase
    from docp_parsers.objects._pageobject import PageObject



[docs]
class DocPDF(_DocBase):
    """Container class for storing data parsed from a PDF file."""

    def __init__(self):
        """PDF document object class initialiser."""
        super().__init__()
        self._tags = False
        self._pages = [PageObject(pageno=0)]    # List of PageObjects, offset by 1 to align
                                                # the index with page numbers.
        self._tables = []                       # List of extracted table objects.

    @property
    def pages(self) -> list[PageObject]:
        """A list of containing an object for each page in the document.

        .. tip::

            The page number index aligns to the page number in the PDF
            file.

            For example, to access the ``PageObject`` for page 42, use::

                pages[42]

       """
        return self._pages

    @property
    def marked_content(self) -> bool:
        """Indicate if the document was parsed using marked-content tags.

        PDF documents can be created with 'marked content' tags. When
        a PDF document is parsed using tags, as this flag indicates, the
        parser respects columns and other page formatting schemes. If a
        multi-column page is parsed without tags, the parser reads
        straight across the line, thus corrupting the text.

        """
        return self._tags

    @property
    def tables(self) -> list:
        """Accessor to data extracted from a document's tables."""
        return self._tables