# Source code for pdftools_toolbox.pdf.content.content_extractor
from __future__ import annotations
import io
from typing import List, Iterator, Tuple, Optional, Any, TYPE_CHECKING, Callable
from ctypes import *
from datetime import datetime
from numbers import Number
from pdftools_toolbox.internal import _lib
from pdftools_toolbox.internal.utils import _string_to_utf16, _utf16_to_string
from pdftools_toolbox.internal.streams import _StreamDescriptor, _NativeStream
from pdftools_toolbox.internal.native_base import _NativeBase
from pdftools_toolbox.internal.native_object import _NativeObject
from collections.abc import Iterable
import pdftools_toolbox.internal
import pdftools_toolbox.pdf.content.content_element
# The real classes are imported only for static type checkers; at runtime the
# names are bound to their dotted paths as plain strings instead. The methods
# that need the actual classes import them locally.
if TYPE_CHECKING:
    from pdftools_toolbox.pdf.content.ungrouping_selection import UngroupingSelection
    from pdftools_toolbox.pdf.content.content import Content
    from pdftools_toolbox.pdf.content.content_element import ContentElement
else:
    UngroupingSelection = "pdftools_toolbox.pdf.content.ungrouping_selection.UngroupingSelection"
    Content = "pdftools_toolbox.pdf.content.content.Content"
    ContentElement = "pdftools_toolbox.pdf.content.content_element.ContentElement"
# [docs]  (appears to be a documentation-page link marker, not module code)
class ContentExtractor(_NativeObject, Iterable):
    """
    Iterable wrapper around a native content extractor handle.

    An instance is created from a ``Content`` object. Iterating over it
    requests a native iterator and returns a
    :class:`ContentExtractor.ContentExtractorIterator`, whose items are built
    with ``ContentElement._create_dynamic_type``.
    """

    def __init__(self, content: Content):
        """
        Create a new content extractor

        Args:
            content (pdftools_toolbox.pdf.content.content.Content):
                the content object of a page or group

        Raises:
            TypeError:
                if `content` is not an instance of
                :class:`pdftools_toolbox.pdf.content.content.Content`

            OSError:
                Error reading from the document

            pdftools_toolbox.corrupt_error.CorruptError:
                The document is corrupt

            ValueError:
                if the document associated with the `content` object has already been closed

            ValueError:
                if the `content`'s document is an output document
        """
        # Imported locally: at module level the name is only a string alias
        # unless a type checker is running.
        from pdftools_toolbox.pdf.content.content import Content

        if not isinstance(content, Content):
            raise TypeError(f"Expected type {Content.__name__}, but got {type(content).__name__}.")

        _lib.PtxPdfContent_ContentExtractor_New.argtypes = [c_void_p]
        _lib.PtxPdfContent_ContentExtractor_New.restype = c_void_p
        ret_val = _lib.PtxPdfContent_ContentExtractor_New(content._handle)
        # ctypes returns None for a NULL pointer when restype is c_void_p.
        if ret_val is None:
            _NativeBase._throw_last_error(False)
        super()._initialize(ret_val)

    @property
    def ungrouping(self) -> UngroupingSelection:
        """
        Configures the extractor's behavior regarding the selection of groups to be un-grouped.

        Default value: :attr:`pdftools_toolbox.pdf.content.ungrouping_selection.UngroupingSelection.NONE` .

        Returns:
            pdftools_toolbox.pdf.content.ungrouping_selection.UngroupingSelection

        Raises:
            StateError:
                the object has already been closed
        """
        from pdftools_toolbox.pdf.content.ungrouping_selection import UngroupingSelection

        _lib.PtxPdfContent_ContentExtractor_GetUngrouping.argtypes = [c_void_p]
        _lib.PtxPdfContent_ContentExtractor_GetUngrouping.restype = c_int
        ret_val = _lib.PtxPdfContent_ContentExtractor_GetUngrouping(self._handle)
        # A result of 0 triggers a last-error check before the value is
        # converted to the enum.
        # NOTE(review): presumably 0 can also be a legitimate enum value and
        # _throw_last_error() only raises when an error is pending - confirm.
        if ret_val == 0:
            _NativeBase._throw_last_error()
        return UngroupingSelection(ret_val)

    @ungrouping.setter
    def ungrouping(self, val: UngroupingSelection) -> None:
        """
        Configures the extractor's behavior regarding the selection of groups to be un-grouped.

        Default value: :attr:`pdftools_toolbox.pdf.content.ungrouping_selection.UngroupingSelection.NONE` .

        Args:
            val (pdftools_toolbox.pdf.content.ungrouping_selection.UngroupingSelection):
                property value

        Raises:
            TypeError:
                if `val` is not an
                :class:`pdftools_toolbox.pdf.content.ungrouping_selection.UngroupingSelection`

            StateError:
                the object has already been closed
        """
        from pdftools_toolbox.pdf.content.ungrouping_selection import UngroupingSelection

        if not isinstance(val, UngroupingSelection):
            raise TypeError(f"Expected type {UngroupingSelection.__name__}, but got {type(val).__name__}.")

        _lib.PtxPdfContent_ContentExtractor_SetUngrouping.argtypes = [c_void_p, c_int]
        _lib.PtxPdfContent_ContentExtractor_SetUngrouping.restype = c_bool
        # The native setter reports failure through a false return value.
        if not _lib.PtxPdfContent_ContentExtractor_SetUngrouping(self._handle, c_int(val.value)):
            _NativeBase._throw_last_error(False)

    def __iter__(self) -> ContentExtractor.ContentExtractorIterator:
        """
        Request a native iterator for this extractor and wrap it.

        Returns:
            ContentExtractor.ContentExtractorIterator:
                a new iterator object owning the native iterator handle
        """
        _lib.PtxPdfContent_ContentExtractor_GetIterator.argtypes = [c_void_p]
        _lib.PtxPdfContent_ContentExtractor_GetIterator.restype = c_void_p
        iterator_handle = _lib.PtxPdfContent_ContentExtractor_GetIterator(self._handle)
        # ctypes returns None for a NULL pointer when restype is c_void_p.
        if iterator_handle is None:
            _NativeBase._throw_last_error(False)
        return ContentExtractor.ContentExtractorIterator(iterator_handle)

    class ContentExtractorIterator(_NativeObject):
        """
        Iterator over the elements of a :class:`ContentExtractor`.

        Wraps a native iterator handle. The object is its own iterator and can
        also be used as a context manager.
        """

        def __iter__(self) -> ContentExtractor.ContentExtractorIterator:
            """Return this object; it is its own iterator."""
            return self

        def __enter__(self) -> ContentExtractor.ContentExtractorIterator:
            """Enter the context manager and return this iterator."""
            return self

        def __exit__(self, exc_type, exc_value, traceback) -> None:
            """Leave the context manager by invoking ``__del__`` explicitly."""
            # NOTE(review): __del__ is not defined in this class; presumably
            # the base class releases the native handle there - confirm.
            self.__del__()

        def __init__(self, iterator_handle: c_void_p) -> None:
            """
            Wrap a native iterator handle.

            Args:
                iterator_handle (c_void_p):
                    handle as returned by the native ``GetIterator`` call
            """
            super()._initialize(iterator_handle)
            # Most recently returned element; None until the first __next__.
            self._current: Optional[ContentElement] = None

        def __next__(self) -> ContentElement:
            """
            Advance the native iterator and return the element at the new position.

            Raises:
                StopIteration:
                    when the native ``MoveNext`` call returns false
            """
            _lib.PtxPdfContent_ContentExtractorIterator_MoveNext.argtypes = [c_void_p]
            _lib.PtxPdfContent_ContentExtractorIterator_MoveNext.restype = c_bool
            ret_val = _lib.PtxPdfContent_ContentExtractorIterator_MoveNext(self._handle)
            # A false result is treated as end of iteration; the last-error
            # state is not consulted here.
            if not ret_val:
                raise StopIteration
            self._current = self._get_value()
            return self._current

        def _get_value(self) -> ContentElement:
            """Fetch the element at the iterator's current position."""
            from pdftools_toolbox.pdf.content.content_element import ContentElement

            _lib.PtxPdfContent_ContentExtractorIterator_GetValue.argtypes = [c_void_p]
            _lib.PtxPdfContent_ContentExtractorIterator_GetValue.restype = c_void_p
            ret_val = _lib.PtxPdfContent_ContentExtractorIterator_GetValue(self._handle)
            # ctypes returns None for a NULL pointer when restype is c_void_p.
            if ret_val is None:
                _NativeBase._throw_last_error(False)
            return ContentElement._create_dynamic_type(ret_val)

    @staticmethod
    def _create_dynamic_type(handle):
        """Build a ``ContentExtractor`` for an existing native handle."""
        return ContentExtractor._from_handle(handle)

    @classmethod
    def _from_handle(cls, handle):
        """
        Internal factory method for constructing an instance using an internal handle.
        This method creates an instance of the class by bypassing the public constructor.
        """
        instance = ContentExtractor.__new__(cls)  # Bypass __init__
        instance._initialize(handle)
        return instance

    def _initialize(self, handle):
        """Store the native handle by delegating to the base class."""
        super()._initialize(handle)