Skip to content

Extractors Reference

PDF Extractor

PdfExtractor

Native PDF text extraction using pypdfium2.

unifex.pdf.PdfExtractor

Bases: BaseExtractor

Extract text and metadata from PDF files using pypdfium2.

Source code in unifex/pdf/pdf.py
class PdfExtractor(BaseExtractor):
    """Extract text and metadata from PDF files using pypdfium2.

    Text is read natively from the PDF content stream (no OCR) and merged
    into blocks by a pluggable CharacterMerger. Table extraction is
    delegated to tabula-py, an optional dependency.
    """

    def __init__(
        self,
        path: Path | str,
        output_unit: CoordinateUnit = CoordinateUnit.POINTS,
        character_merger: CharacterMerger | None = None,
    ) -> None:
        """Open the PDF document.

        Args:
            path: Path to the PDF file (Path object or string).
            output_unit: Coordinate unit for output. Default POINTS.
            character_merger: Strategy for merging raw characters into text
                blocks. Defaults to BasicLineMerger.
        """
        super().__init__(path, output_unit)
        self._pdf = pdfium.PdfDocument(self.path)
        self._merger = character_merger if character_merger is not None else BasicLineMerger()
        # Serializes access to the pdfium handle for parallel extraction.
        self._lock = threading.Lock()

    def get_page_count(self) -> int:
        """Return the number of pages in the document."""
        return len(self._pdf)

    def extract_page(
        self,
        page: int,
        table_options: dict[str, Any] | None = None,
    ) -> PageExtractionResult:
        """Extract a single page by number (0-indexed).

        Thread-safe: uses internal lock for parallel access.

        Args:
            page: Page number (0-indexed).
            table_options: Optional dict of tabula options for table extraction.
                If provided, tables will be extracted and added to Page.tables.
                Common options: lattice, stream, columns, area, guess, multiple_tables.

        Returns:
            PageExtractionResult; on failure success=False, error holds the
            message, and the page is an empty placeholder.
        """
        try:
            with self._lock:
                pdf_page = self._pdf[page]
                width, height = pdf_page.get_size()
                text_blocks = self._extract_text_blocks(pdf_page, height)

            # Table extraction happens outside the lock: tabula re-reads the
            # file from disk and does not touch the pdfium handle.
            tables: list[Table] = []
            if table_options is not None:
                tables = self._extract_tables_for_page(page, table_options)

            result_page = Page(
                page=page,
                width=width,
                height=height,
                texts=text_blocks,
                tables=tables,
            )
            # Convert from native POINTS to output_unit
            result_page = self._convert_page(result_page, CoordinateUnit.POINTS)
            return PageExtractionResult(page=result_page, success=True)
        except Exception as e:
            return PageExtractionResult(
                page=Page(page=page, width=0, height=0, texts=[]),
                success=False,
                error=str(e),
            )

    def get_extractor_metadata(self) -> ExtractorMetadata:
        """Read document-level metadata (title, author, dates, ...).

        Metadata failures are logged and tolerated; missing keys simply
        produce None fields in the returned ExtractorMetadata.
        """
        metadata_dict = {}
        try:
            for key in ["Title", "Author", "Creator", "Producer", "CreationDate", "ModDate"]:
                val = self._pdf.get_metadata_value(key)
                if val:
                    metadata_dict[key.lower()] = val
        except (KeyError, ValueError, pdfium.PdfiumError) as e:
            logger.warning("Failed to extract PDF metadata: %s", e)

        return ExtractorMetadata(
            extractor_type=ExtractorType.PDF,
            title=metadata_dict.get("title"),
            author=metadata_dict.get("author"),
            creator=metadata_dict.get("creator"),
            producer=metadata_dict.get("producer"),
            creation_date=metadata_dict.get("creationdate"),
            modification_date=metadata_dict.get("moddate"),
        )

    def close(self) -> None:
        """Close the underlying pdfium document handle."""
        self._pdf.close()

    def _extract_text_blocks(self, page: pdfium.PdfPage, page_height: float) -> list[TextBlock]:
        """Collect per-character info for a page and merge into text blocks.

        Args:
            page: Open pdfium page object.
            page_height: Page height in points (needed by the merger for
                coordinate-system flipping).
        """
        textpage = page.get_textpage()
        char_count = textpage.count_chars()
        if char_count == 0:
            return []

        # Batch text extraction (206x faster than per-char)
        all_text = textpage.get_text_range(0, char_count)

        # Check rotation support once, not per character
        has_rotation = hasattr(textpage, "get_char_rotation")

        chars: list[CharInfo] = []
        for i in range(char_count):
            bbox = textpage.get_charbox(i)
            rotation = textpage.get_char_rotation(i) if has_rotation else 0
            chars.append(CharInfo(char=all_text[i], bbox=bbox, rotation=rotation, index=i))

        return self._merger.merge(chars, textpage, page_height)

    def extract_tables(
        self,
        pages: Sequence[int] | None = None,
        table_options: dict[str, Any] | None = None,
    ) -> list[Table]:
        """Extract tables from PDF pages using tabula.

        Args:
            pages: Sequence of page numbers to extract (0-indexed).
                   If None, extracts from all pages.
            table_options: Dict of tabula options. Common options:
                - lattice: bool - Use lattice mode (tables with cell borders)
                - stream: bool - Use stream mode (tables without borders)
                - columns: list[float] - Column x-coordinates for splitting
                - area: tuple[float, float, float, float] - (top, left, bottom, right)
                - guess: bool - Guess table areas automatically
                - multiple_tables: bool - Extract multiple tables per page
                - pandas_options: dict - Options for pandas

        Returns:
            List of Table objects with page field indicating source page.
        """
        if pages is None:
            pages = range(self.get_page_count())

        options = table_options or {}
        all_tables: list[Table] = []

        for page_num in pages:
            page_tables = self._extract_tables_for_page(page_num, options)
            all_tables.extend(page_tables)

        return all_tables

    def _extract_tables_for_page(
        self,
        page: int,
        options: dict[str, Any],
    ) -> list[Table]:
        """Extract tables from a single page using tabula.

        Args:
            page: Page number (0-indexed).
            options: Tabula options dict.

        Returns:
            List of Table objects for this page.

        Raises:
            ImportError: If tabula-py is not installed.
        """
        try:
            import tabula
        except ImportError as e:
            raise ImportError(
                "tabula-py is required for table extraction. "
                "Install with: pip install 'unifex[tables]'"
            ) from e

        tabula_opts = self._build_tabula_options(page, options)
        dfs = tabula.read_pdf(str(self.path), **tabula_opts)

        # Drop empty DataFrames so callers never see zero-row tables.
        return [self._dataframe_to_table(df, page) for df in dfs if not df.empty]

    def _build_tabula_options(self, page: int, options: dict[str, Any]) -> dict[str, Any]:
        """Build tabula options dict from user options."""
        # Tabula uses 1-indexed pages
        tabula_opts: dict[str, Any] = {
            "pages": page + 1,
            "multiple_tables": options.get("multiple_tables", True),
            "guess": options.get("guess", True),
        }

        # Copy optional settings. Test for key presence (not truthiness) so
        # explicitly-passed falsy values such as lattice=False, columns=[] or
        # area=(0, 0, 0, 0) are forwarded instead of silently dropped.
        for key in ("lattice", "stream", "columns", "area", "pandas_options"):
            if key in options:
                tabula_opts[key] = options[key]

        return tabula_opts

    def _dataframe_to_table(self, df: Any, page: int) -> Table:
        """Convert a pandas DataFrame to a Table model.

        The DataFrame's column names become row 0 (header); data rows follow
        starting at row 1.
        """
        cells: list[TableCell] = []
        row_count = len(df)
        col_count = len(df.columns)

        # Add header row (column names)
        for col_idx, col_name in enumerate(df.columns):
            cell_text = str(col_name) if col_name is not None else ""
            cells.append(TableCell(text=cell_text, row=0, col=col_idx))

        # Add data rows
        for row_idx, row in enumerate(df.itertuples(index=False), start=1):
            for col_idx, value in enumerate(row):
                # NOTE(review): NaN is detected via its string form; values
                # rendering as "NaT" or "<NA>" pass through as text.
                cell_text = str(value) if value is not None and str(value) != "nan" else ""
                cells.append(TableCell(text=cell_text, row=row_idx, col=col_idx))

        return Table(
            page=page,
            cells=cells,
            row_count=row_count + 1,  # +1 for header row
            col_count=col_count,
        )

extract_page

extract_page(
    page: int, table_options: dict[str, Any] | None = None
) -> PageExtractionResult

Extract a single page by number (0-indexed).

Thread-safe: uses internal lock for parallel access.

Parameters:

Name Type Description Default
page int

Page number (0-indexed).

required
table_options dict[str, Any] | None

Optional dict of tabula options for table extraction. If provided, tables will be extracted and added to Page.tables. Common options: lattice, stream, columns, area, guess, multiple_tables.

None
Source code in unifex/pdf/pdf.py
def extract_page(
    self,
    page: int,
    table_options: dict[str, Any] | None = None,
) -> PageExtractionResult:
    """Extract one page (0-indexed) of text, and optionally tables.

    Thread-safe: page access is serialized through the internal lock.

    Args:
        page: Page number (0-indexed).
        table_options: Tabula options; when not None, tables are extracted
            for this page and attached to Page.tables. Common options:
            lattice, stream, columns, area, guess, multiple_tables.
    """
    try:
        # pdfium access happens under the lock; table extraction does not
        # need it because tabula reads the file independently.
        with self._lock:
            pdfium_page = self._pdf[page]
            page_width, page_height = pdfium_page.get_size()
            blocks = self._extract_text_blocks(pdfium_page, page_height)

        extracted_tables: list[Table] = (
            self._extract_tables_for_page(page, table_options)
            if table_options is not None
            else []
        )

        assembled = Page(
            page=page,
            width=page_width,
            height=page_height,
            texts=blocks,
            tables=extracted_tables,
        )
        # Native PDF geometry is in points; convert to the requested unit.
        assembled = self._convert_page(assembled, CoordinateUnit.POINTS)
        return PageExtractionResult(page=assembled, success=True)
    except Exception as e:
        # Failure: report the error with an empty placeholder page.
        return PageExtractionResult(
            page=Page(page=page, width=0, height=0, texts=[]),
            success=False,
            error=str(e),
        )

extract_tables

extract_tables(
    pages: Sequence[int] | None = None,
    table_options: dict[str, Any] | None = None,
) -> list[Table]

Extract tables from PDF pages using tabula.

Parameters:

Name Type Description Default
pages Sequence[int] | None

Sequence of page numbers to extract (0-indexed). If None, extracts from all pages.

None
table_options dict[str, Any] | None

Dict of tabula options. Common options:

- `lattice: bool` — use lattice mode (tables with cell borders)
- `stream: bool` — use stream mode (tables without borders)
- `columns: list[float]` — column x-coordinates for splitting
- `area: tuple[float, float, float, float]` — (top, left, bottom, right)
- `guess: bool` — guess table areas automatically
- `multiple_tables: bool` — extract multiple tables per page
- `pandas_options: dict` — options for pandas

None

Returns:

Type Description
list[Table]

List of Table objects with page field indicating source page.

Source code in unifex/pdf/pdf.py
def extract_tables(
    self,
    pages: Sequence[int] | None = None,
    table_options: dict[str, Any] | None = None,
) -> list[Table]:
    """Extract tables from the given pages via tabula.

    Args:
        pages: 0-indexed page numbers; when None, every page is processed.
        table_options: Tabula options dict. Common keys: lattice, stream,
            columns, area, guess, multiple_tables, pandas_options.

    Returns:
        All extracted Table objects; each carries its source page number.
    """
    target_pages: Sequence[int] | range = (
        range(self.get_page_count()) if pages is None else pages
    )
    opts = table_options or {}

    collected: list[Table] = []
    for page_num in target_pages:
        collected.extend(self._extract_tables_for_page(page_num, opts))

    return collected

Local OCR Extractors

EasyOcrExtractor

OCR using EasyOCR library.

unifex.ocr.extractors.easy_ocr.EasyOcrExtractor

Bases: BaseExtractor

Extract text from images or PDFs using EasyOCR.

Composes ImageLoader for image handling, EasyOCR for OCR processing, and EasyOCRAdapter for result conversion.

Source code in unifex/ocr/extractors/easy_ocr.py
class EasyOcrExtractor(BaseExtractor):
    """OCR extractor backed by the EasyOCR engine.

    Image loading is delegated to ImageLoader, recognition to EasyOCR,
    and conversion of raw engine output to EasyOCRAdapter.
    """

    def __init__(
        self,
        path: Path | str,
        languages: list[str] | None = None,
        gpu: bool = False,
        dpi: int = 200,
        output_unit: CoordinateUnit = CoordinateUnit.POINTS,
    ) -> None:
        """Set up the EasyOCR extractor.

        Args:
            path: Image or PDF file to process (Path object or string).
            languages: OCR language codes; defaults to ["en"].
            gpu: Enable GPU acceleration.
            dpi: Rendering resolution for PDF inputs. Default 200.
            output_unit: Unit for output coordinates. Default POINTS.
        """
        _check_easyocr_installed()
        super().__init__(path, output_unit)
        self.gpu = gpu
        self.dpi = dpi
        self.languages = languages or ["en"]

        # Delegate image access and raw-result conversion to helper objects.
        self._images = ImageLoader(self.path, dpi)
        self._adapter = EasyOCRAdapter()

    def get_page_count(self) -> int:
        """Number of pages/images exposed by the loader."""
        return self._images.page_count

    def extract_page(self, page: int) -> PageExtractionResult:
        """Run EasyOCR on one page/image and return the result."""
        import numpy as np

        try:
            image = self._images.get_page(page)
            px_width, px_height = image.size

            # Obtain the OCR reader and run recognition on the raw pixels.
            reader = get_reader(self.languages, self.gpu)
            raw_results = reader.readtext(np.array(image))
            blocks = self._adapter.convert_result(raw_results)

            native = Page(
                page=page,
                width=float(px_width),
                height=float(px_height),
                texts=blocks,
            )
            # OCR coordinates are pixels; convert to the requested unit.
            converted = self._convert_page(native, CoordinateUnit.PIXELS, self.dpi)
            return PageExtractionResult(page=converted, success=True)

        except Exception as e:
            logger.warning("Failed to extract page %d: %s", page, e)
            return PageExtractionResult(
                page=Page(page=page, width=0, height=0, texts=[]),
                success=False,
                error=str(e),
            )

    def get_extractor_metadata(self) -> ExtractorMetadata:
        """Describe this extractor: engine, languages, and dpi for PDFs."""
        details = {"ocr_engine": "easyocr", "languages": self.languages}
        if self._images.is_pdf:
            details["dpi"] = self.dpi
        return ExtractorMetadata(
            extractor_type=ExtractorType.EASYOCR,
            extra=details,
        )

    def get_init_params(self) -> dict[str, Any]:
        """Constructor kwargs sufficient to rebuild this extractor in a worker."""
        return dict(
            path=self.path,
            languages=self.languages,
            gpu=self.gpu,
            dpi=self.dpi,
            output_unit=self.output_unit,
        )

    def close(self) -> None:
        """Free resources held by the image loader."""
        self._images.close()

__init__

__init__(
    path: Path | str,
    languages: list[str] | None = None,
    gpu: bool = False,
    dpi: int = 200,
    output_unit: CoordinateUnit = CoordinateUnit.POINTS,
) -> None

Initialize EasyOCR extractor.

Parameters:

Name Type Description Default
path Path | str

Path to the image or PDF file (Path object or string).

required
languages list[str] | None

List of language codes for OCR. Defaults to ["en"].

None
gpu bool

Whether to use GPU acceleration.

False
dpi int

DPI for PDF-to-image conversion. Default 200.

200
output_unit CoordinateUnit

Coordinate unit for output. Default POINTS.

POINTS
Source code in unifex/ocr/extractors/easy_ocr.py
def __init__(
    self,
    path: Path | str,
    languages: list[str] | None = None,
    gpu: bool = False,
    dpi: int = 200,
    output_unit: CoordinateUnit = CoordinateUnit.POINTS,
) -> None:
    """Set up the EasyOCR extractor.

    Args:
        path: Image or PDF file to process (Path object or string).
        languages: OCR language codes; defaults to ["en"].
        gpu: Enable GPU acceleration.
        dpi: Rendering resolution for PDF inputs. Default 200.
        output_unit: Unit for output coordinates. Default POINTS.
    """
    _check_easyocr_installed()
    super().__init__(path, output_unit)
    self.gpu = gpu
    self.dpi = dpi
    self.languages = languages or ["en"]

    # Delegate image access and raw-result conversion to helper objects.
    self._images = ImageLoader(self.path, dpi)
    self._adapter = EasyOCRAdapter()

get_page_count

get_page_count() -> int

Return number of pages/images loaded.

Source code in unifex/ocr/extractors/easy_ocr.py
def get_page_count(self) -> int:
    """Number of pages/images exposed by the underlying loader."""
    loader = self._images
    return loader.page_count

extract_page

extract_page(page: int) -> PageExtractionResult

Extract text from a single image/page.

Source code in unifex/ocr/extractors/easy_ocr.py
def extract_page(self, page: int) -> PageExtractionResult:
    """Run EasyOCR on one page/image and return the result."""
    import numpy as np

    try:
        image = self._images.get_page(page)
        px_width, px_height = image.size

        # Obtain the OCR reader and run recognition on the raw pixels.
        reader = get_reader(self.languages, self.gpu)
        raw_results = reader.readtext(np.array(image))
        blocks = self._adapter.convert_result(raw_results)

        native = Page(
            page=page,
            width=float(px_width),
            height=float(px_height),
            texts=blocks,
        )
        # OCR coordinates are pixels; convert to the requested unit.
        converted = self._convert_page(native, CoordinateUnit.PIXELS, self.dpi)
        return PageExtractionResult(page=converted, success=True)

    except Exception as e:
        logger.warning("Failed to extract page %d: %s", page, e)
        return PageExtractionResult(
            page=Page(page=page, width=0, height=0, texts=[]),
            success=False,
            error=str(e),
        )

get_extractor_metadata

get_extractor_metadata() -> ExtractorMetadata

Return extractor metadata.

Source code in unifex/ocr/extractors/easy_ocr.py
def get_extractor_metadata(self) -> ExtractorMetadata:
    """Describe this extractor: engine, languages, and dpi for PDFs."""
    details = {"ocr_engine": "easyocr", "languages": self.languages}
    if self._images.is_pdf:
        details["dpi"] = self.dpi
    return ExtractorMetadata(extractor_type=ExtractorType.EASYOCR, extra=details)

get_init_params

get_init_params() -> dict[str, Any]

Return parameters for recreating this extractor in a worker process.

Source code in unifex/ocr/extractors/easy_ocr.py
def get_init_params(self) -> dict[str, Any]:
    """Constructor kwargs sufficient to rebuild this extractor in a worker."""
    return dict(
        path=self.path,
        languages=self.languages,
        gpu=self.gpu,
        dpi=self.dpi,
        output_unit=self.output_unit,
    )

close

close() -> None

Release resources.

Source code in unifex/ocr/extractors/easy_ocr.py
def close(self) -> None:
    """Free resources held by the underlying image loader."""
    loader = self._images
    loader.close()

TesseractOcrExtractor

OCR using Tesseract.

unifex.ocr.extractors.tesseract_ocr.TesseractOcrExtractor

Bases: BaseExtractor

Extract text from images or PDFs using Tesseract OCR.

Composes ImageLoader for image handling, Tesseract for OCR processing, and TesseractAdapter for result conversion.

Source code in unifex/ocr/extractors/tesseract_ocr.py
class TesseractOcrExtractor(BaseExtractor):
    """OCR extractor backed by Tesseract.

    Image loading is delegated to ImageLoader, recognition to pytesseract,
    and conversion of raw engine output to TesseractAdapter.
    """

    def __init__(
        self,
        path: Path | str,
        languages: list[str] | None = None,
        dpi: int = 200,
        output_unit: CoordinateUnit = CoordinateUnit.POINTS,
    ) -> None:
        """Set up the Tesseract extractor.

        Args:
            path: Image or PDF file to process (Path object or string).
            languages: 2-letter ISO 639-1 codes (e.g. ["en", "fr"]);
                       defaults to ["en"]. Converted internally to
                       Tesseract's own language codes.
            dpi: Rendering resolution for PDF inputs. Default 200.
            output_unit: Unit for output coordinates. Default POINTS.
        """
        _check_pytesseract_installed()
        super().__init__(path, output_unit)
        requested = languages or ["en"]
        # Keep the caller's ISO codes for metadata reporting...
        self.languages = requested
        # ...and Tesseract-format codes for actually invoking the engine.
        self._tesseract_languages = [_convert_lang_code(code) for code in requested]
        self.dpi = dpi

        # Delegate image access and raw-result conversion to helper objects.
        self._images = ImageLoader(self.path, dpi)
        self._adapter = TesseractAdapter()

    def get_page_count(self) -> int:
        """Number of pages/images exposed by the loader."""
        return self._images.page_count

    def extract_page(self, page: int) -> PageExtractionResult:
        """Run Tesseract OCR on one page/image and return the result."""
        import pytesseract

        try:
            image = self._images.get_page(page)
            px_width, px_height = image.size

            # Tesseract takes its languages as a single "+"-joined string.
            joined = "+".join(self._tesseract_languages)
            ocr_data = pytesseract.image_to_data(
                image, lang=joined, output_type=pytesseract.Output.DICT
            )
            blocks = self._adapter.convert_result(ocr_data)

            native = Page(
                page=page,
                width=float(px_width),
                height=float(px_height),
                texts=blocks,
            )
            # OCR coordinates are pixels; convert to the requested unit.
            converted = self._convert_page(native, CoordinateUnit.PIXELS, self.dpi)
            return PageExtractionResult(page=converted, success=True)

        except Exception as e:
            logger.warning("Failed to extract page %d: %s", page, e)
            return PageExtractionResult(
                page=Page(page=page, width=0, height=0, texts=[]),
                success=False,
                error=str(e),
            )

    def get_extractor_metadata(self) -> ExtractorMetadata:
        """Describe this extractor: engine, languages, and dpi for PDFs."""
        details = {"ocr_engine": "tesseract", "languages": self.languages}
        if self._images.is_pdf:
            details["dpi"] = self.dpi
        return ExtractorMetadata(
            extractor_type=ExtractorType.TESSERACT,
            extra=details,
        )

    def get_init_params(self) -> dict[str, Any]:
        """Constructor kwargs sufficient to rebuild this extractor in a worker."""
        return dict(
            path=self.path,
            languages=self.languages,
            dpi=self.dpi,
            output_unit=self.output_unit,
        )

    def close(self) -> None:
        """Free resources held by the image loader."""
        self._images.close()

__init__

__init__(
    path: Path | str,
    languages: list[str] | None = None,
    dpi: int = 200,
    output_unit: CoordinateUnit = CoordinateUnit.POINTS,
) -> None

Initialize Tesseract OCR extractor.

Parameters:

Name Type Description Default
path Path | str

Path to the image or PDF file (Path object or string).

required
languages list[str] | None

List of 2-letter ISO 639-1 language codes (e.g., ["en", "fr"]). Defaults to ["en"]. Codes are converted to Tesseract format internally.

None
dpi int

DPI for PDF-to-image conversion. Default 200.

200
output_unit CoordinateUnit

Coordinate unit for output. Default POINTS.

POINTS
Source code in unifex/ocr/extractors/tesseract_ocr.py
def __init__(
    self,
    path: Path | str,
    languages: list[str] | None = None,
    dpi: int = 200,
    output_unit: CoordinateUnit = CoordinateUnit.POINTS,
) -> None:
    """Set up the Tesseract extractor.

    Args:
        path: Image or PDF file to process (Path object or string).
        languages: 2-letter ISO 639-1 codes (e.g. ["en", "fr"]); defaults
                   to ["en"]. Converted internally to Tesseract's codes.
        dpi: Rendering resolution for PDF inputs. Default 200.
        output_unit: Unit for output coordinates. Default POINTS.
    """
    _check_pytesseract_installed()
    super().__init__(path, output_unit)
    requested = languages or ["en"]
    # Keep the caller's ISO codes for metadata reporting...
    self.languages = requested
    # ...and Tesseract-format codes for actually invoking the engine.
    self._tesseract_languages = [_convert_lang_code(code) for code in requested]
    self.dpi = dpi

    # Delegate image access and raw-result conversion to helper objects.
    self._images = ImageLoader(self.path, dpi)
    self._adapter = TesseractAdapter()

get_page_count

get_page_count() -> int

Return number of pages/images loaded.

Source code in unifex/ocr/extractors/tesseract_ocr.py
def get_page_count(self) -> int:
    """Number of pages/images exposed by the underlying loader."""
    loader = self._images
    return loader.page_count

extract_page

extract_page(page: int) -> PageExtractionResult

Extract text from a single image/page.

Source code in unifex/ocr/extractors/tesseract_ocr.py
def extract_page(self, page: int) -> PageExtractionResult:
    """Run Tesseract OCR on one page/image and return the result."""
    import pytesseract

    try:
        image = self._images.get_page(page)
        px_width, px_height = image.size

        # Tesseract takes its languages as a single "+"-joined string.
        joined = "+".join(self._tesseract_languages)
        ocr_data = pytesseract.image_to_data(
            image, lang=joined, output_type=pytesseract.Output.DICT
        )
        blocks = self._adapter.convert_result(ocr_data)

        native = Page(
            page=page,
            width=float(px_width),
            height=float(px_height),
            texts=blocks,
        )
        # OCR coordinates are pixels; convert to the requested unit.
        converted = self._convert_page(native, CoordinateUnit.PIXELS, self.dpi)
        return PageExtractionResult(page=converted, success=True)

    except Exception as e:
        logger.warning("Failed to extract page %d: %s", page, e)
        return PageExtractionResult(
            page=Page(page=page, width=0, height=0, texts=[]),
            success=False,
            error=str(e),
        )

get_extractor_metadata

get_extractor_metadata() -> ExtractorMetadata

Return extractor metadata.

Source code in unifex/ocr/extractors/tesseract_ocr.py
def get_extractor_metadata(self) -> ExtractorMetadata:
    """Describe this extractor: engine, languages, and dpi for PDFs."""
    details = {"ocr_engine": "tesseract", "languages": self.languages}
    if self._images.is_pdf:
        details["dpi"] = self.dpi
    return ExtractorMetadata(extractor_type=ExtractorType.TESSERACT, extra=details)

get_init_params

get_init_params() -> dict[str, Any]

Return parameters for recreating this extractor in a worker process.

Source code in unifex/ocr/extractors/tesseract_ocr.py
def get_init_params(self) -> dict[str, Any]:
    """Constructor kwargs sufficient to rebuild this extractor in a worker."""
    return dict(
        path=self.path,
        languages=self.languages,
        dpi=self.dpi,
        output_unit=self.output_unit,
    )

close

close() -> None

Release resources.

Source code in unifex/ocr/extractors/tesseract_ocr.py
def close(self) -> None:
    """Free resources held by the underlying image loader."""
    loader = self._images
    loader.close()

PaddleOcrExtractor

OCR using PaddleOCR.

unifex.ocr.extractors.paddle_ocr.PaddleOcrExtractor

Bases: BaseExtractor

Extract text from images or PDFs using PaddleOCR.

Composes ImageLoader for image handling, PaddleOCR for OCR, and PaddleOCRAdapter for result conversion.

PaddleOCR model is loaded lazily on first extraction and cached globally.

Source code in unifex/ocr/extractors/paddle_ocr.py
class PaddleOcrExtractor(BaseExtractor):
    """OCR extractor backed by PaddleOCR.

    Image loading is delegated to ImageLoader and raw-result conversion to
    PaddleOCRAdapter. The PaddleOCR model itself is loaded lazily on first
    extraction and cached globally.
    """

    def __init__(
        self,
        path: Path | str,
        lang: str = "en",
        use_gpu: bool = False,
        dpi: int = 200,
        output_unit: CoordinateUnit = CoordinateUnit.POINTS,
    ) -> None:
        """Set up the PaddleOCR extractor.

        Args:
            path: Image or PDF file to process (Path object or string).
            lang: PaddleOCR language code, e.g. "en" (English), "ch"
                  (Chinese), "fr" (French), "german", "japan", "korean".
                  See PaddleOCR docs for the full list.
            use_gpu: Enable GPU acceleration.
            dpi: Rendering resolution for PDF inputs. Default 200.
            output_unit: Unit for output coordinates. Default POINTS.
        """
        _check_paddleocr_installed()
        super().__init__(path, output_unit)
        self.use_gpu = use_gpu
        self.dpi = dpi
        self.lang = lang

        # Helpers only; the OCR model is created lazily on first use.
        self._images = ImageLoader(self.path, dpi)
        self._adapter = PaddleOCRAdapter()

    def get_page_count(self) -> int:
        """Number of pages/images exposed by the loader."""
        return self._images.page_count

    def extract_page(self, page: int) -> PageExtractionResult:
        """Run PaddleOCR on one page/image and return the result."""
        import numpy as np

        try:
            image = self._images.get_page(page)
            px_width, px_height = image.size

            # Lazily load/retrieve the cached OCR model.
            engine = get_paddle_ocr(self.lang, self.use_gpu)
            pixels = np.array(image)

            # Entry point differs by PaddleOCR major version.
            version = _get_paddle_major_version()
            if version >= PADDLEOCR_V3_MAJOR:
                raw = engine.predict(pixels)
            else:
                raw = engine.ocr(pixels, cls=True)

            blocks = self._adapter.convert_result(raw, version)

            native = Page(
                page=page,
                width=float(px_width),
                height=float(px_height),
                texts=blocks,
            )
            # OCR coordinates are pixels; convert to the requested unit.
            converted = self._convert_page(native, CoordinateUnit.PIXELS, self.dpi)
            return PageExtractionResult(page=converted, success=True)

        except Exception as e:
            logger.warning("Failed to extract page %d: %s", page, e)
            return PageExtractionResult(
                page=Page(page=page, width=0, height=0, texts=[]),
                success=False,
                error=str(e),
            )

    def extract_tables(
        self,
        pages: list[int] | None = None,
    ) -> list[Table]:
        """Extract tables from the document using PPStructure.

        Args:
            pages: 0-indexed page numbers; when None, every page is processed.

        Returns:
            Table objects, each tagged with its source page number.
        """
        import numpy as np

        target_pages = list(range(self.get_page_count())) if pages is None else pages

        found: list[Table] = []
        engine = get_ppstructure(self.lang, self.use_gpu)

        for page_num in target_pages:
            try:
                pixels = np.array(self._images.get_page(page_num))

                # PPStructure yields layout elements; keep only the tables.
                for element in engine(pixels):
                    if element.get("type") == "table":
                        found.append(
                            self._adapter.convert_table_result(element, page=page_num)
                        )

            except Exception as e:
                logger.warning("Failed to extract tables from page %d: %s", page_num, e)

        return found

    def get_extractor_metadata(self) -> ExtractorMetadata:
        """Describe this extractor: engine, language, and dpi for PDFs."""
        details = {"ocr_engine": "paddleocr", "languages": self.lang}
        if self._images.is_pdf:
            details["dpi"] = self.dpi
        return ExtractorMetadata(
            extractor_type=ExtractorType.PADDLE,
            extra=details,
        )

    def get_init_params(self) -> dict[str, Any]:
        """Constructor kwargs sufficient to rebuild this extractor in a worker."""
        return dict(
            path=self.path,
            lang=self.lang,
            use_gpu=self.use_gpu,
            dpi=self.dpi,
            output_unit=self.output_unit,
        )

    def close(self) -> None:
        """Free resources held by the image loader."""
        self._images.close()

__init__

__init__(
    path: Path | str,
    lang: str = "en",
    use_gpu: bool = False,
    dpi: int = 200,
    output_unit: CoordinateUnit = CoordinateUnit.POINTS,
) -> None

Initialize PaddleOCR extractor.

Parameters:

Name Type Description Default
path Path | str

Path to the image or PDF file (Path object or string).

required
lang str

Language code for OCR. Common values:

- `"en"` — English
- `"ch"` — Chinese
- `"fr"` — French
- `"german"` — German
- `"japan"` — Japanese
- `"korean"` — Korean

See PaddleOCR docs for the full list.

'en'
use_gpu bool

Whether to use GPU acceleration.

False
dpi int

DPI for PDF-to-image conversion. Default 200.

200
output_unit CoordinateUnit

Coordinate unit for output. Default POINTS.

POINTS
Source code in unifex/ocr/extractors/paddle_ocr.py
def __init__(
    self,
    path: Path | str,
    lang: str = "en",
    use_gpu: bool = False,
    dpi: int = 200,
    output_unit: CoordinateUnit = CoordinateUnit.POINTS,
) -> None:
    """Initialize PaddleOCR extractor.

    Args:
        path: Path to the image or PDF file (Path object or string).
        lang: Language code for OCR. Common values:
              - "en" for English
              - "ch" for Chinese
              - "fr" for French
              - "german" for German
              - "japan" for Japanese
              - "korean" for Korean
              See PaddleOCR docs for full list.
        use_gpu: Whether to use GPU acceleration.
        dpi: DPI for PDF-to-image conversion. Default 200.
        output_unit: Coordinate unit for output. Default POINTS.
    """
    # Fail fast with a clear error if the optional paddleocr dependency
    # is missing, before any state is set up.
    _check_paddleocr_installed()
    super().__init__(path, output_unit)
    self.lang = lang
    self.use_gpu = use_gpu
    self.dpi = dpi

    # Compose lightweight components only; the OCR model itself is
    # loaded lazily on first use (see extract_page).
    self._images = ImageLoader(self.path, dpi)
    self._adapter = PaddleOCRAdapter()

get_page_count

get_page_count() -> int

Return number of pages/images loaded.

Source code in unifex/ocr/extractors/paddle_ocr.py
def get_page_count(self) -> int:
    """Return number of pages/images loaded (delegates to the ImageLoader)."""
    return self._images.page_count

extract_page

extract_page(page: int) -> PageExtractionResult

Extract text from a single image/page.

Source code in unifex/ocr/extractors/paddle_ocr.py
def extract_page(self, page: int) -> PageExtractionResult:
    """Extract text from a single image/page.

    Args:
        page: Page number (0-indexed).

    Returns:
        PageExtractionResult with success=True and the extracted Page, or
        success=False with an empty Page and the error message on failure.
    """
    import numpy as np

    try:
        img = self._images.get_page(page)
        width, height = img.size

        # Run OCR pipeline (lazy load model)
        ocr = get_paddle_ocr(self.lang, self.use_gpu)
        img_array = np.array(img)

        # Use version-specific API: PaddleOCR >= v3 exposes predict();
        # older versions use ocr(..., cls=True).
        major_version = _get_paddle_major_version()
        if major_version >= PADDLEOCR_V3_MAJOR:
            result = ocr.predict(img_array)
        else:
            result = ocr.ocr(img_array, cls=True)

        # The adapter normalizes the version-specific result shape.
        text_blocks = self._adapter.convert_result(result, major_version)

        result_page = Page(
            page=page,
            width=float(width),
            height=float(height),
            texts=text_blocks,
        )

        # Convert from native PIXELS to output_unit
        result_page = self._convert_page(result_page, CoordinateUnit.PIXELS, self.dpi)
        return PageExtractionResult(page=result_page, success=True)

    except Exception as e:
        # Per-page failures are reported, not raised, so callers can keep
        # processing the remaining pages.
        logger.warning("Failed to extract page %d: %s", page, e)
        return PageExtractionResult(
            page=Page(page=page, width=0, height=0, texts=[]),
            success=False,
            error=str(e),
        )

extract_tables

extract_tables(
    pages: list[int] | None = None,
) -> list[Table]

Extract tables from document using PPStructure.

Parameters:

Name Type Description Default
pages list[int] | None

List of page numbers to extract (0-indexed). If None, extracts from all pages.

None

Returns:

Type Description
list[Table]

List of Table objects with page field indicating source page.

Source code in unifex/ocr/extractors/paddle_ocr.py
def extract_tables(
    self,
    pages: list[int] | None = None,
) -> list[Table]:
    """Extract tables from document using PPStructure.

    Args:
        pages: List of page numbers to extract (0-indexed).
               If None, extracts from all pages.

    Returns:
        List of Table objects with page field indicating source page.
    """
    import numpy as np

    if pages is None:
        pages = list(range(self.get_page_count()))

    all_tables: list[Table] = []
    # Layout-analysis engine used for table detection.
    engine = get_ppstructure(self.lang, self.use_gpu)

    for page_num in pages:
        try:
            img = self._images.get_page(page_num)
            img_array = np.array(img)

            # PPStructure returns list of layout elements
            result = engine(img_array)

            for element in result:
                # Keep only the layout elements classified as tables.
                if element.get("type") == "table":
                    table = self._adapter.convert_table_result(element, page=page_num)
                    all_tables.append(table)

        except Exception as e:
            # One bad page must not abort table extraction for the whole
            # document: log and continue with the next page.
            logger.warning("Failed to extract tables from page %d: %s", page_num, e)

    return all_tables

get_extractor_metadata

get_extractor_metadata() -> ExtractorMetadata

Return extractor metadata.

Source code in unifex/ocr/extractors/paddle_ocr.py
def get_extractor_metadata(self) -> ExtractorMetadata:
    """Return extractor metadata.

    DPI is reported only for PDF inputs (images use native resolution).
    """
    details = {"ocr_engine": "paddleocr", "languages": self.lang}
    if self._images.is_pdf:
        details["dpi"] = self.dpi
    return ExtractorMetadata(extractor_type=ExtractorType.PADDLE, extra=details)

get_init_params

get_init_params() -> dict[str, Any]

Return parameters for recreating this extractor in a worker process.

Source code in unifex/ocr/extractors/paddle_ocr.py
def get_init_params(self) -> dict[str, Any]:
    """Return the constructor arguments needed to recreate this extractor
    in a worker process (for parallel extraction)."""
    return dict(
        path=self.path,
        lang=self.lang,
        use_gpu=self.use_gpu,
        dpi=self.dpi,
        output_unit=self.output_unit,
    )

close

close() -> None

Release resources.

Source code in unifex/ocr/extractors/paddle_ocr.py
def close(self) -> None:
    """Release the image resources held by this extractor."""
    self._images.close()

Cloud OCR Extractors

AzureDocumentIntelligenceExtractor

Azure Document Intelligence OCR.

unifex.ocr.extractors.azure_di.AzureDocumentIntelligenceExtractor

Bases: BaseExtractor

Extract text from documents using Azure Document Intelligence.

Source code in unifex/ocr/extractors/azure_di.py
class AzureDocumentIntelligenceExtractor(BaseExtractor):
    """Extract text from documents using Azure Document Intelligence."""

    def __init__(
        self,
        path: Path | str,
        endpoint: str,
        key: str,
        model_id: str = "prebuilt-read",
        output_unit: CoordinateUnit = CoordinateUnit.POINTS,
    ) -> None:
        """Initialize the extractor and analyze the document immediately.

        Args:
            path: Path to the document file (Path object or string).
            endpoint: Azure Document Intelligence service endpoint URL.
            key: API key for the service.
            model_id: Azure analysis model to run. Default "prebuilt-read".
            output_unit: Coordinate unit for output. Default POINTS.
        """
        _check_azure_installed()
        from azure.ai.documentintelligence import DocumentIntelligenceClient
        from azure.core.credentials import AzureKeyCredential

        super().__init__(path, output_unit)
        self.endpoint = endpoint
        self.model_id = model_id
        self._client: DocumentIntelligenceClient = DocumentIntelligenceClient(
            endpoint=endpoint,
            credential=AzureKeyCredential(key),
        )
        self._result: Any | None = None
        self._adapter: AzureDocumentIntelligenceAdapter | None = None
        # Analysis is eager: the document is uploaded and processed once
        # here, so later extract_page() calls only read the cached result.
        self._analyze_document()

    def _analyze_document(self) -> None:
        """Send document to Azure DI for analysis."""
        try:
            with open(self.path, "rb") as f:
                poller = self._client.begin_analyze_document(
                    model_id=self.model_id,
                    body=f,
                    content_type="application/octet-stream",
                )
                self._result = poller.result()
                self._adapter = AzureDocumentIntelligenceAdapter(self._result, self.model_id)
        except (OSError, ValueError) as e:
            # Analysis failures are non-fatal: install a None-backed adapter
            # so metadata queries still work after a failed analysis.
            logger.warning("Failed to analyze document with Azure DI: %s", e)
            self._result = None
            self._adapter = AzureDocumentIntelligenceAdapter(None, self.model_id)

    def get_page_count(self) -> int:
        """Return the page count reported by the adapter (0 if no adapter)."""
        if self._adapter is None:
            return 0
        return self._adapter.page_count

    def extract_page(self, page: int) -> PageExtractionResult:
        """Extract a single page by number (0-indexed)."""
        try:
            if self._adapter is None:
                raise ValueError("Document analysis failed")

            converted_page = self._adapter.convert_page(page)
            # Convert from native INCHES to output_unit
            converted_page = self._convert_page(converted_page, CoordinateUnit.INCHES)
            return PageExtractionResult(page=converted_page, success=True)

        except (IndexError, ValueError, AttributeError) as e:
            # Failures are reported in the result rather than raised.
            logger.warning("Failed to extract page %d from Azure DI result: %s", page, e)
            return PageExtractionResult(
                page=Page(page=page, width=0, height=0, texts=[]),
                success=False,
                error=str(e),
            )

    def get_extractor_metadata(self) -> ExtractorMetadata:
        """Return extractor metadata, even when no analysis result exists."""
        if self._adapter is None:
            return AzureDocumentIntelligenceAdapter(None, self.model_id).get_metadata()
        return self._adapter.get_metadata()

    def close(self) -> None:
        """Close the underlying Azure client."""
        if self._client is not None:
            self._client.close()

extract_page

extract_page(page: int) -> PageExtractionResult

Extract a single page by number (0-indexed).

Source code in unifex/ocr/extractors/azure_di.py
def extract_page(self, page: int) -> PageExtractionResult:
    """Extract a single page by number (0-indexed)."""
    try:
        if self._adapter is None:
            raise ValueError("Document analysis failed")

        page_data = self._adapter.convert_page(page)
        # Azure DI's native unit is inches; convert to the requested unit.
        page_data = self._convert_page(page_data, CoordinateUnit.INCHES)
        return PageExtractionResult(page=page_data, success=True)

    except (IndexError, ValueError, AttributeError) as e:
        logger.warning("Failed to extract page %d from Azure DI result: %s", page, e)
        empty = Page(page=page, width=0, height=0, texts=[])
        return PageExtractionResult(page=empty, success=False, error=str(e))

GoogleDocumentAIExtractor

Google Document AI OCR.

unifex.ocr.extractors.google_docai.GoogleDocumentAIExtractor

Bases: BaseExtractor

Extract text from documents using Google Document AI.

Source code in unifex/ocr/extractors/google_docai.py
class GoogleDocumentAIExtractor(BaseExtractor):
    """Extract text from documents using Google Document AI."""

    def __init__(
        self,
        path: Path | str,
        processor_name: str,
        credentials_path: str,
        mime_type: str | None = None,
        output_unit: CoordinateUnit = CoordinateUnit.POINTS,
    ) -> None:
        """Initialize Google Document AI extractor.

        Args:
            path: Path to the document file.
            processor_name: Full processor resource name, e.g.,
                'projects/{project}/locations/{location}/processors/{processor_id}'
            credentials_path: Path to service account JSON credentials file.
            mime_type: Optional MIME type. If not provided, will be inferred from file extension.
            output_unit: Coordinate unit for output. Default POINTS.
        """
        _check_google_docai_installed()
        from google.cloud import documentai
        from google.oauth2 import service_account

        super().__init__(path, output_unit)
        self.processor_name = processor_name
        self.credentials_path = credentials_path
        self.mime_type = mime_type or self._infer_mime_type()

        # Create credentials from service account file
        credentials = service_account.Credentials.from_service_account_file(credentials_path)

        # The API endpoint is regional, so derive it from the processor name.
        # Format: projects/{project}/locations/{location}/processors/{processor_id}
        location = self._extract_location_from_processor_name(processor_name)
        opts = {"api_endpoint": f"{location}-documentai.googleapis.com"}

        self._client = documentai.DocumentProcessorServiceClient(
            credentials=credentials, client_options=opts
        )
        self._document: Document | None = None
        self._adapter: GoogleDocumentAIAdapter | None = None
        # Processing is eager: the document is sent to the API once here, so
        # later extract_page() calls only read the cached result.
        self._process_document()

    def _infer_mime_type(self) -> str:
        """Infer MIME type from file extension (defaults to application/pdf)."""
        suffix = self.path.suffix.lower()
        mime_types = {
            ".pdf": "application/pdf",
            ".png": "image/png",
            ".jpg": "image/jpeg",
            ".jpeg": "image/jpeg",
            ".tiff": "image/tiff",
            ".tif": "image/tiff",
            ".gif": "image/gif",
            ".bmp": "image/bmp",
            ".webp": "image/webp",
        }
        return mime_types.get(suffix, "application/pdf")

    @staticmethod
    def _extract_location_from_processor_name(processor_name: str) -> str:
        """Extract location from processor resource name."""
        # Format: projects/{project}/locations/{location}/processors/{processor_id}
        parts = processor_name.split("/")
        try:
            loc_index = parts.index("locations")
            return parts[loc_index + 1]
        except (ValueError, IndexError):
            return "us"  # Default to US

    def _process_document(self) -> None:
        """Send document to Google Document AI for processing.

        On failure, logs a warning and installs a None-backed adapter so the
        extractor object remains usable.
        """
        from google.cloud import documentai

        try:
            with open(self.path, "rb") as f:
                content = f.read()

            raw_document = documentai.RawDocument(
                content=content,
                mime_type=self.mime_type,
            )

            request = documentai.ProcessRequest(
                name=self.processor_name,
                raw_document=raw_document,
            )

            result = self._client.process_document(request=request)
            self._document = result.document
            self._adapter = GoogleDocumentAIAdapter(self._document, self.processor_name)

        except Exception as e:
            # Was `except (OSError, ValueError, Exception)`: listing Exception
            # subclasses alongside Exception is redundant (flake8-bugbear B014).
            # The broad catch is deliberate: any processing failure (I/O, bad
            # request, API error) must not break construction.
            logger.warning("Failed to process document with Google Document AI: %s", e)
            self._document = None
            self._adapter = GoogleDocumentAIAdapter(None, self.processor_name)

    def get_page_count(self) -> int:
        """Return the page count reported by the adapter (0 if no adapter)."""
        if self._adapter is None:
            return 0
        return self._adapter.page_count

    def extract_page(self, page: int) -> PageExtractionResult:
        """Extract a single page by number (0-indexed)."""
        try:
            if self._adapter is None:
                raise ValueError("Document processing failed")

            converted_page = self._adapter.convert_page(page)
            # Google DocAI outputs pixels after denormalization
            # Use 72 DPI as standard PDF resolution for conversion
            converted_page = self._convert_page(converted_page, CoordinateUnit.PIXELS, dpi=72.0)
            return PageExtractionResult(page=converted_page, success=True)

        except (IndexError, ValueError, AttributeError) as e:
            logger.warning("Failed to extract page %d from Google Document AI result: %s", page, e)
            return PageExtractionResult(
                page=Page(page=page, width=0, height=0, texts=[]),
                success=False,
                error=str(e),
            )

    def get_extractor_metadata(self) -> ExtractorMetadata:
        """Return extractor metadata, even when processing failed."""
        if self._adapter is None:
            return GoogleDocumentAIAdapter(None, self.processor_name).get_metadata()
        return self._adapter.get_metadata()

    def close(self) -> None:
        """Close the underlying gRPC transport."""
        if self._client is not None:
            self._client.transport.close()

__init__

__init__(
    path: Path | str,
    processor_name: str,
    credentials_path: str,
    mime_type: str | None = None,
    output_unit: CoordinateUnit = CoordinateUnit.POINTS,
) -> None

Initialize Google Document AI extractor.

Parameters:

Name Type Description Default
path Path | str

Path to the document file.

required
processor_name str

Full processor resource name, e.g., 'projects/{project}/locations/{location}/processors/{processor_id}'

required
credentials_path str

Path to service account JSON credentials file.

required
mime_type str | None

Optional MIME type. If not provided, will be inferred from file extension.

None
output_unit CoordinateUnit

Coordinate unit for output. Default POINTS.

POINTS
Source code in unifex/ocr/extractors/google_docai.py
def __init__(
    self,
    path: Path | str,
    processor_name: str,
    credentials_path: str,
    mime_type: str | None = None,
    output_unit: CoordinateUnit = CoordinateUnit.POINTS,
) -> None:
    """Initialize Google Document AI extractor.

    Args:
        path: Path to the document file.
        processor_name: Full processor resource name, e.g.,
            'projects/{project}/locations/{location}/processors/{processor_id}'
        credentials_path: Path to service account JSON credentials file.
        mime_type: Optional MIME type. If not provided, will be inferred from file extension.
        output_unit: Coordinate unit for output. Default POINTS.
    """
    _check_google_docai_installed()
    from google.cloud import documentai
    from google.oauth2 import service_account

    super().__init__(path, output_unit)
    self.processor_name = processor_name
    self.credentials_path = credentials_path
    self.mime_type = mime_type or self._infer_mime_type()

    # Create credentials from service account file
    credentials = service_account.Credentials.from_service_account_file(credentials_path)

    # The API endpoint is regional, so derive it from the processor name.
    # Format: projects/{project}/locations/{location}/processors/{processor_id}
    location = self._extract_location_from_processor_name(processor_name)
    opts = {"api_endpoint": f"{location}-documentai.googleapis.com"}

    self._client = documentai.DocumentProcessorServiceClient(
        credentials=credentials, client_options=opts
    )
    self._document: Document | None = None
    self._adapter: GoogleDocumentAIAdapter | None = None
    # Processing is eager: the document is sent to the API once here.
    self._process_document()

extract_page

extract_page(page: int) -> PageExtractionResult

Extract a single page by number (0-indexed).

Source code in unifex/ocr/extractors/google_docai.py
def extract_page(self, page: int) -> PageExtractionResult:
    """Extract a single page by number (0-indexed)."""
    try:
        if self._adapter is None:
            raise ValueError("Document processing failed")

        page_data = self._adapter.convert_page(page)
        # Google DocAI outputs pixels after denormalization; convert them
        # using 72 DPI, the standard PDF resolution.
        page_data = self._convert_page(page_data, CoordinateUnit.PIXELS, dpi=72.0)
        return PageExtractionResult(page=page_data, success=True)

    except (IndexError, ValueError, AttributeError) as e:
        logger.warning("Failed to extract page %d from Google Document AI result: %s", page, e)
        empty = Page(page=page, width=0, height=0, texts=[])
        return PageExtractionResult(page=empty, success=False, error=str(e))

LLM Extractors

extract_structured

Synchronous LLM extraction function.

unifex.llm_factory.extract_structured

extract_structured(
    path: Path | str,
    model: str,
    *,
    schema: type[T],
    prompt: str | None = None,
    pages: list[int] | None = None,
    dpi: int = 200,
    max_retries: int = 3,
    temperature: float = 0.0,
    credentials: dict[str, str] | None = None,
    base_url: str | None = None,
    headers: dict[str, str] | None = None,
    _extractor: Any = None,
) -> LLMExtractionResult[T]
extract_structured(
    path: Path | str,
    model: str,
    *,
    schema: None = None,
    prompt: str | None = None,
    pages: list[int] | None = None,
    dpi: int = 200,
    max_retries: int = 3,
    temperature: float = 0.0,
    credentials: dict[str, str] | None = None,
    base_url: str | None = None,
    headers: dict[str, str] | None = None,
    _extractor: Any = None,
) -> LLMExtractionResult[dict[str, Any]]
extract_structured(
    path: Path | str,
    model: str,
    *,
    schema: type[T] | None = None,
    prompt: str | None = None,
    pages: list[int] | None = None,
    dpi: int = 200,
    max_retries: int = 3,
    temperature: float = 0.0,
    credentials: dict[str, str] | None = None,
    base_url: str | None = None,
    headers: dict[str, str] | None = None,
    _extractor: SingleExtractor[T] | None = None,
) -> LLMExtractionResult[T | dict[str, Any]]

Extract structured data from a document using an LLM.

All specified pages are sent in a single request.

Parameters:

Name Type Description Default
path Path | str

Path to document/image file.

required
model str

Model identifier (e.g., "openai/gpt-4o", "anthropic/claude-3-5-sonnet").

required
schema type[T] | None

Pydantic model for structured output. None for free-form dict.

None
prompt str | None

Custom extraction prompt. Auto-generated from schema if None.

None
pages list[int] | None

Page numbers to extract from (0-indexed). None for all pages.

None
dpi int

DPI for PDF-to-image conversion.

200
max_retries int

Max retry attempts with validation feedback.

3
temperature float

Sampling temperature (0.0 = deterministic).

0.0
credentials dict[str, str] | None

Override credentials dict (otherwise uses env vars).

None
base_url str | None

Custom API base URL for OpenAI-compatible APIs (vLLM, Ollama, etc.).

None
headers dict[str, str] | None

Custom HTTP headers for OpenAI-compatible APIs.

None
_extractor SingleExtractor[T] | None

Internal parameter for dependency injection (testing only).

None

Returns:

Type Description
LLMExtractionResult[T | dict[str, Any]]

LLMExtractionResult containing the extracted data, model info, and provider.

Source code in unifex/llm_factory.py
def extract_structured(  # noqa: PLR0913
    path: Path | str,
    model: str,
    *,
    schema: type[T] | None = None,
    prompt: str | None = None,
    pages: list[int] | None = None,
    dpi: int = 200,
    max_retries: int = 3,
    temperature: float = 0.0,
    credentials: dict[str, str] | None = None,
    base_url: str | None = None,
    headers: dict[str, str] | None = None,
    _extractor: SingleExtractor[T] | None = None,
) -> LLMExtractionResult[T | dict[str, Any]]:
    """Extract structured data from a document using an LLM.

    All specified pages are sent in a single request.

    Args:
        path: Path to document/image file.
        model: Model identifier (e.g., "openai/gpt-4o", "anthropic/claude-3-5-sonnet").
        schema: Pydantic model for structured output. None for free-form dict.
        prompt: Custom extraction prompt. Auto-generated from schema if None.
        pages: Page numbers to extract from (0-indexed). None for all pages.
        dpi: DPI for PDF-to-image conversion.
        max_retries: Max retry attempts with validation feedback.
        temperature: Sampling temperature (0.0 = deterministic).
        credentials: Override credentials dict (otherwise uses env vars).
        base_url: Custom API base URL for OpenAI-compatible APIs (vLLM, Ollama, etc.).
        headers: Custom HTTP headers for OpenAI-compatible APIs.
        _extractor: Internal parameter for dependency injection (testing only).

    Returns:
        [LLMExtractionResult][unifex.llm.models.LLMExtractionResult] containing extracted data,
        model info, and provider.
    """
    # Normalize string paths up front so downstream code always sees a Path.
    if isinstance(path, str):
        path = Path(path)
    # Injected extractor takes precedence; otherwise use the real one.
    run = _extractor or _extract_single
    return run(
        path,
        model,
        schema,
        prompt,
        pages,
        dpi,
        max_retries,
        temperature,
        credentials,
        base_url,
        headers,
    )

extract_structured_async

Asynchronous LLM extraction function.

unifex.llm_factory.extract_structured_async async

extract_structured_async(
    path: Path | str,
    model: str,
    *,
    schema: type[T],
    prompt: str | None = None,
    pages: list[int] | None = None,
    dpi: int = 200,
    max_retries: int = 3,
    temperature: float = 0.0,
    credentials: dict[str, str] | None = None,
    base_url: str | None = None,
    headers: dict[str, str] | None = None,
    _extractor: Any = None,
) -> LLMExtractionResult[T]
extract_structured_async(
    path: Path | str,
    model: str,
    *,
    schema: None = None,
    prompt: str | None = None,
    pages: list[int] | None = None,
    dpi: int = 200,
    max_retries: int = 3,
    temperature: float = 0.0,
    credentials: dict[str, str] | None = None,
    base_url: str | None = None,
    headers: dict[str, str] | None = None,
    _extractor: Any = None,
) -> LLMExtractionResult[dict[str, Any]]
extract_structured_async(
    path: Path | str,
    model: str,
    *,
    schema: type[T] | None = None,
    prompt: str | None = None,
    pages: list[int] | None = None,
    dpi: int = 200,
    max_retries: int = 3,
    temperature: float = 0.0,
    credentials: dict[str, str] | None = None,
    base_url: str | None = None,
    headers: dict[str, str] | None = None,
    _extractor: AsyncSingleExtractor[T] | None = None,
) -> LLMExtractionResult[T | dict[str, Any]]

Async version of extract_structured.

All specified pages are sent in a single request.

Parameters:

Name Type Description Default
path Path | str

Path to document/image file.

required
model str

Model identifier (e.g., "openai/gpt-4o", "anthropic/claude-3-5-sonnet").

required
schema type[T] | None

Pydantic model for structured output. None for free-form dict.

None
prompt str | None

Custom extraction prompt. Auto-generated from schema if None.

None
pages list[int] | None

Page numbers to extract from (0-indexed). None for all pages.

None
dpi int

DPI for PDF-to-image conversion.

200
max_retries int

Max retry attempts with validation feedback.

3
temperature float

Sampling temperature (0.0 = deterministic).

0.0
credentials dict[str, str] | None

Override credentials dict (otherwise uses env vars).

None
base_url str | None

Custom API base URL for OpenAI-compatible APIs (vLLM, Ollama, etc.).

None
headers dict[str, str] | None

Custom HTTP headers for OpenAI-compatible APIs.

None
_extractor AsyncSingleExtractor[T] | None

Internal parameter for dependency injection (testing only).

None

Returns:

Type Description
LLMExtractionResult[T | dict[str, Any]]

LLMExtractionResult containing the extracted data, model info, and provider.

Source code in unifex/llm_factory.py
async def extract_structured_async(  # noqa: PLR0913
    path: Path | str,
    model: str,
    *,
    schema: type[T] | None = None,
    prompt: str | None = None,
    pages: list[int] | None = None,
    dpi: int = 200,
    max_retries: int = 3,
    temperature: float = 0.0,
    credentials: dict[str, str] | None = None,
    base_url: str | None = None,
    headers: dict[str, str] | None = None,
    _extractor: AsyncSingleExtractor[T] | None = None,
) -> LLMExtractionResult[T | dict[str, Any]]:
    """Async version of extract_structured.

    All specified pages are sent in a single request.

    Args:
        path: Path to document/image file.
        model: Model identifier (e.g., "openai/gpt-4o", "anthropic/claude-3-5-sonnet").
        schema: Pydantic model for structured output. None for free-form dict.
        prompt: Custom extraction prompt. Auto-generated from schema if None.
        pages: Page numbers to extract from (0-indexed). None for all pages.
        dpi: DPI for PDF-to-image conversion.
        max_retries: Max retry attempts with validation feedback.
        temperature: Sampling temperature (0.0 = deterministic).
        credentials: Override credentials dict (otherwise uses env vars).
        base_url: Custom API base URL for OpenAI-compatible APIs (vLLM, Ollama, etc.).
        headers: Custom HTTP headers for OpenAI-compatible APIs.
        _extractor: Internal parameter for dependency injection (testing only).

    Returns:
        [LLMExtractionResult][unifex.llm.models.LLMExtractionResult] containing extracted data,
        model info, and provider.
    """
    # Normalize string paths up front so downstream code always sees a Path.
    if isinstance(path, str):
        path = Path(path)
    # Injected extractor takes precedence; otherwise use the real one.
    run = _extractor or _extract_single_async
    return await run(
        path,
        model,
        schema,
        prompt,
        pages,
        dpi,
        max_retries,
        temperature,
        credentials,
        base_url,
        headers,
    )