# Source code for pdftools_toolbox.pdf.content.content_extractor

from __future__ import annotations
import io
from typing import List, Iterator, Tuple, Optional, Any, TYPE_CHECKING, Callable
from ctypes import *
from datetime import datetime
from numbers import Number
from pdftools_toolbox.internal import _lib
from pdftools_toolbox.internal.utils import _string_to_utf16, _utf16_to_string
from pdftools_toolbox.internal.streams import _StreamDescriptor, _NativeStream
from pdftools_toolbox.internal.native_base import _NativeBase
from pdftools_toolbox.internal.native_object import _NativeObject
from collections.abc import Iterable

import pdftools_toolbox.internal
import pdftools_toolbox.pdf.content.content_element

# Static type checkers see the real classes; at runtime the same names are
# bound to plain strings holding the fully qualified class names instead.
# Methods that need the real class at runtime import it locally.
# NOTE(review): presumably this avoids circular imports between the content
# modules - confirm.
if TYPE_CHECKING:
    from pdftools_toolbox.pdf.content.ungrouping_selection import UngroupingSelection
    from pdftools_toolbox.pdf.content.content import Content
    from pdftools_toolbox.pdf.content.content_element import ContentElement

else:
    # Runtime placeholders: strings, not classes (unusable with isinstance()).
    UngroupingSelection = "pdftools_toolbox.pdf.content.ungrouping_selection.UngroupingSelection"
    Content = "pdftools_toolbox.pdf.content.content.Content"
    ContentElement = "pdftools_toolbox.pdf.content.content_element.ContentElement"


class ContentExtractor(_NativeObject, Iterable):
    """
    Iterable wrapper around a native content extractor.

    An extractor is created from the content object of a page or group.
    Iterating over it yields content elements one at a time; every call to
    ``iter()`` requests a new native iterator handle.
    """

    def __init__(self, content: Content):
        """
        Create a new content extractor

        Args:
            content (pdftools_toolbox.pdf.content.content.Content):
                the content object of a page or group

        Raises:
            TypeError:
                if `content` is not a :class:`pdftools_toolbox.pdf.content.content.Content`

            OSError:
                Error reading from the document

            pdftools_toolbox.corrupt_error.CorruptError:
                The document is corrupt

            ValueError:
                if the document associated with the `content` object has already been closed

            ValueError:
                if the `content`'s document is an output document

        """
        # Local import: the real class is needed for the isinstance() check.
        from pdftools_toolbox.pdf.content.content import Content

        if not isinstance(content, Content):
            raise TypeError(f"Expected type {Content.__name__}, but got {type(content).__name__}.")

        _lib.PtxPdfContent_ContentExtractor_New.argtypes = [c_void_p]
        _lib.PtxPdfContent_ContentExtractor_New.restype = c_void_p
        ret_val = _lib.PtxPdfContent_ContentExtractor_New(content._handle)
        # ctypes maps a NULL pointer result of a c_void_p function to None.
        if ret_val is None:
            _NativeBase._throw_last_error(False)
        super()._initialize(ret_val)

    @property
    def ungrouping(self) -> UngroupingSelection:
        """
        Configures the extractor's behavior regarding the selection of groups to be un-grouped.

        Default value: :attr:`pdftools_toolbox.pdf.content.ungrouping_selection.UngroupingSelection.NONE` .

        Returns:
            pdftools_toolbox.pdf.content.ungrouping_selection.UngroupingSelection

        Raises:
            StateError:
                the object has already been closed

        """
        from pdftools_toolbox.pdf.content.ungrouping_selection import UngroupingSelection

        _lib.PtxPdfContent_ContentExtractor_GetUngrouping.argtypes = [c_void_p]
        _lib.PtxPdfContent_ContentExtractor_GetUngrouping.restype = c_int
        ret_val = _lib.PtxPdfContent_ContentExtractor_GetUngrouping(self._handle)
        # NOTE(review): a zero result is handed to _throw_last_error() with its
        # default arguments (the NULL-handle checks pass False explicitly).
        # Presumably this raises only if the native layer recorded an error and
        # otherwise 0 is converted like any other value - confirm.
        if ret_val == 0:
            _NativeBase._throw_last_error()
        return UngroupingSelection(ret_val)

    @ungrouping.setter
    def ungrouping(self, val: UngroupingSelection) -> None:
        """
        Configures the extractor's behavior regarding the selection of groups to be un-grouped.

        Default value: :attr:`pdftools_toolbox.pdf.content.ungrouping_selection.UngroupingSelection.NONE` .

        Args:
            val (pdftools_toolbox.pdf.content.ungrouping_selection.UngroupingSelection):
                property value

        Raises:
            TypeError:
                if `val` is not an UngroupingSelection

            StateError:
                the object has already been closed

        """
        from pdftools_toolbox.pdf.content.ungrouping_selection import UngroupingSelection

        if not isinstance(val, UngroupingSelection):
            raise TypeError(f"Expected type {UngroupingSelection.__name__}, but got {type(val).__name__}.")

        _lib.PtxPdfContent_ContentExtractor_SetUngrouping.argtypes = [c_void_p, c_int]
        _lib.PtxPdfContent_ContentExtractor_SetUngrouping.restype = c_bool
        if not _lib.PtxPdfContent_ContentExtractor_SetUngrouping(self._handle, c_int(val.value)):
            _NativeBase._throw_last_error(False)

    def __iter__(self) -> ContentExtractor.ContentExtractorIterator:
        """
        Request a native iterator handle and wrap it.

        Returns:
            ContentExtractor.ContentExtractorIterator: iterator over the extracted elements
        """
        _lib.PtxPdfContent_ContentExtractor_GetIterator.argtypes = [c_void_p]
        _lib.PtxPdfContent_ContentExtractor_GetIterator.restype = c_void_p
        iterator_handle = _lib.PtxPdfContent_ContentExtractor_GetIterator(self._handle)
        if iterator_handle is None:
            _NativeBase._throw_last_error(False)
        return ContentExtractor.ContentExtractorIterator(iterator_handle)

    class ContentExtractorIterator(_NativeObject):
        """
        Iterator over the elements of a :class:`ContentExtractor`.

        It wraps a native iterator handle and can also be used as a context
        manager; leaving the ``with`` block calls ``__del__`` on the iterator.
        """

        def __iter__(self) -> ContentExtractor.ContentExtractorIterator:
            return self

        def __enter__(self) -> ContentExtractor.ContentExtractorIterator:
            return self

        def __exit__(self, exc_type, exc_value, traceback) -> None:
            # NOTE(review): cleanup is delegated to __del__, which is not defined
            # in this class; presumably _NativeObject provides it and releases
            # the native handle - confirm.
            self.__del__()

        def __init__(self, iterator_handle: c_void_p) -> None:
            """
            Wrap an existing native iterator handle.

            Args:
                iterator_handle (c_void_p): handle returned by the native GetIterator call
            """
            super()._initialize(iterator_handle)
            # Most recently returned element; None until the first __next__().
            self._current: Optional[ContentElement] = None

        def __next__(self) -> ContentElement:
            """
            Advance the native iterator and return the element it now points to.

            Raises:
                StopIteration: when the native MoveNext call returns false
            """
            _lib.PtxPdfContent_ContentExtractorIterator_MoveNext.argtypes = [c_void_p]
            _lib.PtxPdfContent_ContentExtractorIterator_MoveNext.restype = c_bool
            ret_val = _lib.PtxPdfContent_ContentExtractorIterator_MoveNext(self._handle)
            if not ret_val:
                raise StopIteration
            self._current = self._get_value()
            return self._current

        def _get_value(self) -> ContentElement:
            """Fetch the element at the current iterator position and wrap it."""
            from pdftools_toolbox.pdf.content.content_element import ContentElement

            _lib.PtxPdfContent_ContentExtractorIterator_GetValue.argtypes = [c_void_p]
            _lib.PtxPdfContent_ContentExtractorIterator_GetValue.restype = c_void_p
            ret_val = _lib.PtxPdfContent_ContentExtractorIterator_GetValue(self._handle)
            if ret_val is None:
                _NativeBase._throw_last_error(False)
            return ContentElement._create_dynamic_type(ret_val)

    @staticmethod
    def _create_dynamic_type(handle):
        """Internal: build a ContentExtractor for an existing native handle."""
        return ContentExtractor._from_handle(handle)

    @classmethod
    def _from_handle(cls, handle):
        """
        Internal factory method for constructing an instance using an internal handle.
        This method creates an instance of the class by bypassing the public constructor.
        """
        instance = ContentExtractor.__new__(cls)  # Bypass __init__
        instance._initialize(handle)
        return instance

    def _initialize(self, handle):
        """Internal: forward the native handle to the base class initializer."""
        super()._initialize(handle)