Skip to content

Extractors Reference

PDF Extractor

PdfExtractor

Native PDF text extraction using pypdfium2.

unifex.pdf.PdfExtractor

Bases: BaseExtractor

Extract text and metadata from PDF files using pypdfium2.

Source code in unifex/pdf/pdf.py
class PdfExtractor(BaseExtractor):
    """Extract text and metadata from PDF files using pypdfium2.

    Text is read natively from the PDF content stream (no OCR) and merged
    into blocks by a pluggable CharacterMerger. Table extraction is
    delegated to tabula-py, an optional dependency.
    """

    def __init__(
        self,
        path: Path | str,
        output_unit: CoordinateUnit = CoordinateUnit.POINTS,
        character_merger: CharacterMerger | None = None,
    ) -> None:
        """Open the PDF document.

        Args:
            path: Path to the PDF file (Path object or string).
            output_unit: Coordinate unit for output. Default POINTS.
            character_merger: Strategy for merging raw characters into text
                blocks. Defaults to BasicLineMerger.
        """
        super().__init__(path, output_unit)
        self._pdf = pdfium.PdfDocument(self.path)
        self._merger = character_merger if character_merger is not None else BasicLineMerger()
        # Serializes access to the pdfium handle for parallel extraction.
        self._lock = threading.Lock()

    def get_page_count(self) -> int:
        """Return the number of pages in the document."""
        return len(self._pdf)

    def extract_page(
        self,
        page: int,
        table_options: dict[str, Any] | None = None,
    ) -> PageExtractionResult:
        """Extract a single page by number (0-indexed).

        Thread-safe: uses internal lock for parallel access.

        Args:
            page: Page number (0-indexed).
            table_options: Optional dict of tabula options for table extraction.
                If provided, tables will be extracted and added to Page.tables.
                Common options: lattice, stream, columns, area, guess, multiple_tables.

        Returns:
            PageExtractionResult; on failure success=False, error holds the
            message, and the page is an empty placeholder.
        """
        try:
            with self._lock:
                pdf_page = self._pdf[page]
                width, height = pdf_page.get_size()
                text_blocks = self._extract_text_blocks(pdf_page, height)

            # Table extraction happens outside the lock: tabula re-reads the
            # file from disk and does not touch the pdfium handle.
            tables: list[Table] = []
            if table_options is not None:
                tables = self._extract_tables_for_page(page, table_options)

            result_page = Page(
                page=page,
                width=width,
                height=height,
                texts=text_blocks,
                tables=tables,
            )
            # Convert from native POINTS to output_unit
            result_page = self._convert_page(result_page, CoordinateUnit.POINTS)
            return PageExtractionResult(page=result_page, success=True)
        except Exception as e:
            return PageExtractionResult(
                page=Page(page=page, width=0, height=0, texts=[]),
                success=False,
                error=str(e),
            )

    def get_extractor_metadata(self) -> ExtractorMetadata:
        """Read document-level metadata (title, author, dates, ...).

        Metadata failures are logged and tolerated; missing keys simply
        produce None fields in the returned ExtractorMetadata.
        """
        metadata_dict = {}
        try:
            for key in ["Title", "Author", "Creator", "Producer", "CreationDate", "ModDate"]:
                val = self._pdf.get_metadata_value(key)
                if val:
                    metadata_dict[key.lower()] = val
        except (KeyError, ValueError, pdfium.PdfiumError) as e:
            logger.warning("Failed to extract PDF metadata: %s", e)

        return ExtractorMetadata(
            extractor_type=ExtractorType.PDF,
            title=metadata_dict.get("title"),
            author=metadata_dict.get("author"),
            creator=metadata_dict.get("creator"),
            producer=metadata_dict.get("producer"),
            creation_date=metadata_dict.get("creationdate"),
            modification_date=metadata_dict.get("moddate"),
        )

    def close(self) -> None:
        """Close the underlying pdfium document handle."""
        self._pdf.close()

    def _extract_text_blocks(self, page: pdfium.PdfPage, page_height: float) -> list[TextBlock]:
        """Collect per-character info for a page and merge into text blocks.

        Args:
            page: Open pdfium page object.
            page_height: Page height in points (needed by the merger for
                coordinate-system flipping).
        """
        textpage = page.get_textpage()
        char_count = textpage.count_chars()
        if char_count == 0:
            return []

        # Batch text extraction (206x faster than per-char)
        all_text = textpage.get_text_range(0, char_count)

        # Check rotation support once, not per character
        has_rotation = hasattr(textpage, "get_char_rotation")

        chars: list[CharInfo] = []
        for i in range(char_count):
            bbox = textpage.get_charbox(i)
            rotation = textpage.get_char_rotation(i) if has_rotation else 0
            chars.append(CharInfo(char=all_text[i], bbox=bbox, rotation=rotation, index=i))

        return self._merger.merge(chars, textpage, page_height)

    def extract_tables(
        self,
        pages: Sequence[int] | None = None,
        table_options: dict[str, Any] | None = None,
    ) -> list[Table]:
        """Extract tables from PDF pages using tabula.

        Args:
            pages: Sequence of page numbers to extract (0-indexed).
                   If None, extracts from all pages.
            table_options: Dict of tabula options. Common options:
                - lattice: bool - Use lattice mode (tables with cell borders)
                - stream: bool - Use stream mode (tables without borders)
                - columns: list[float] - Column x-coordinates for splitting
                - area: tuple[float, float, float, float] - (top, left, bottom, right)
                - guess: bool - Guess table areas automatically
                - multiple_tables: bool - Extract multiple tables per page
                - pandas_options: dict - Options for pandas

        Returns:
            List of Table objects with page field indicating source page.
        """
        if pages is None:
            pages = range(self.get_page_count())

        options = table_options or {}
        all_tables: list[Table] = []

        for page_num in pages:
            page_tables = self._extract_tables_for_page(page_num, options)
            all_tables.extend(page_tables)

        return all_tables

    def _extract_tables_for_page(
        self,
        page: int,
        options: dict[str, Any],
    ) -> list[Table]:
        """Extract tables from a single page using tabula.

        Args:
            page: Page number (0-indexed).
            options: Tabula options dict.

        Returns:
            List of Table objects for this page.

        Raises:
            ImportError: If tabula-py is not installed.
        """
        try:
            import tabula
        except ImportError as e:
            raise ImportError(
                "tabula-py is required for table extraction. "
                "Install with: pip install 'unifex[tables]'"
            ) from e

        tabula_opts = self._build_tabula_options(page, options)
        dfs = tabula.read_pdf(str(self.path), **tabula_opts)

        # Drop empty DataFrames so callers never see zero-row tables.
        return [self._dataframe_to_table(df, page) for df in dfs if not df.empty]

    def _build_tabula_options(self, page: int, options: dict[str, Any]) -> dict[str, Any]:
        """Build tabula options dict from user options."""
        # Tabula uses 1-indexed pages
        tabula_opts: dict[str, Any] = {
            "pages": page + 1,
            "multiple_tables": options.get("multiple_tables", True),
            "guess": options.get("guess", True),
        }

        # Copy optional settings. Test for key presence (not truthiness) so
        # explicitly-passed falsy values such as lattice=False, columns=[] or
        # area=(0, 0, 0, 0) are forwarded instead of silently dropped.
        for key in ("lattice", "stream", "columns", "area", "pandas_options"):
            if key in options:
                tabula_opts[key] = options[key]

        return tabula_opts

    def _dataframe_to_table(self, df: Any, page: int) -> Table:
        """Convert a pandas DataFrame to a Table model.

        The DataFrame's column names become row 0 (header); data rows follow
        starting at row 1.
        """
        cells: list[TableCell] = []
        row_count = len(df)
        col_count = len(df.columns)

        # Add header row (column names)
        for col_idx, col_name in enumerate(df.columns):
            cell_text = str(col_name) if col_name is not None else ""
            cells.append(TableCell(text=cell_text, row=0, col=col_idx))

        # Add data rows
        for row_idx, row in enumerate(df.itertuples(index=False), start=1):
            for col_idx, value in enumerate(row):
                # NOTE(review): NaN is detected via its string form; values
                # rendering as "NaT" or "<NA>" pass through as text.
                cell_text = str(value) if value is not None and str(value) != "nan" else ""
                cells.append(TableCell(text=cell_text, row=row_idx, col=col_idx))

        return Table(
            page=page,
            cells=cells,
            row_count=row_count + 1,  # +1 for header row
            col_count=col_count,
        )

extract_page

extract_page(
    page: int, table_options: dict[str, Any] | None = None
) -> PageExtractionResult

Extract a single page by number (0-indexed).

Thread-safe: uses internal lock for parallel access.

Parameters:

Name Type Description Default
page int

Page number (0-indexed).

required
table_options dict[str, Any] | None

Optional dict of tabula options for table extraction. If provided, tables will be extracted and added to Page.tables. Common options: lattice, stream, columns, area, guess, multiple_tables.

None
Source code in unifex/pdf/pdf.py
def extract_page(
    self,
    page: int,
    table_options: dict[str, Any] | None = None,
) -> PageExtractionResult:
    """Extract one page (0-indexed) of text, and optionally tables.

    Thread-safe: page access is serialized through the internal lock.

    Args:
        page: Page number (0-indexed).
        table_options: Tabula options; when not None, tables are extracted
            for this page and attached to Page.tables. Common options:
            lattice, stream, columns, area, guess, multiple_tables.
    """
    try:
        # pdfium access happens under the lock; table extraction does not
        # need it because tabula reads the file independently.
        with self._lock:
            pdfium_page = self._pdf[page]
            page_width, page_height = pdfium_page.get_size()
            blocks = self._extract_text_blocks(pdfium_page, page_height)

        extracted_tables: list[Table] = (
            self._extract_tables_for_page(page, table_options)
            if table_options is not None
            else []
        )

        assembled = Page(
            page=page,
            width=page_width,
            height=page_height,
            texts=blocks,
            tables=extracted_tables,
        )
        # Native PDF geometry is in points; convert to the requested unit.
        assembled = self._convert_page(assembled, CoordinateUnit.POINTS)
        return PageExtractionResult(page=assembled, success=True)
    except Exception as e:
        # Failure: report the error with an empty placeholder page.
        return PageExtractionResult(
            page=Page(page=page, width=0, height=0, texts=[]),
            success=False,
            error=str(e),
        )

extract_tables

extract_tables(
    pages: Sequence[int] | None = None,
    table_options: dict[str, Any] | None = None,
) -> list[Table]

Extract tables from PDF pages using tabula.

Parameters:

Name Type Description Default
pages Sequence[int] | None

Sequence of page numbers to extract (0-indexed). If None, extracts from all pages.

None
table_options dict[str, Any] | None

Dict of tabula options. Common options:

- `lattice: bool` — use lattice mode (tables with cell borders)
- `stream: bool` — use stream mode (tables without borders)
- `columns: list[float]` — column x-coordinates for splitting
- `area: tuple[float, float, float, float]` — (top, left, bottom, right)
- `guess: bool` — guess table areas automatically
- `multiple_tables: bool` — extract multiple tables per page
- `pandas_options: dict` — options for pandas

None

Returns:

Type Description
list[Table]

List of Table objects with page field indicating source page.

Source code in unifex/pdf/pdf.py
def extract_tables(
    self,
    pages: Sequence[int] | None = None,
    table_options: dict[str, Any] | None = None,
) -> list[Table]:
    """Extract tables from the given pages via tabula.

    Args:
        pages: 0-indexed page numbers; when None, every page is processed.
        table_options: Tabula options dict. Common keys: lattice, stream,
            columns, area, guess, multiple_tables, pandas_options.

    Returns:
        All extracted Table objects; each carries its source page number.
    """
    target_pages: Sequence[int] | range = (
        range(self.get_page_count()) if pages is None else pages
    )
    opts = table_options or {}

    collected: list[Table] = []
    for page_num in target_pages:
        collected.extend(self._extract_tables_for_page(page_num, opts))

    return collected

Local OCR Extractors

EasyOcrExtractor

OCR using EasyOCR library.

unifex.ocr.extractors.easy_ocr.EasyOcrExtractor

Bases: BaseExtractor

Extract text from images or PDFs using EasyOCR.

Composes ImageLoader for image handling, EasyOCR for OCR processing, and EasyOCRAdapter for result conversion.

Source code in unifex/ocr/extractors/easy_ocr.py
class EasyOcrExtractor(BaseExtractor):
    """OCR extractor backed by the EasyOCR engine.

    Image loading is delegated to ImageLoader, recognition to EasyOCR,
    and conversion of raw engine output to EasyOCRAdapter.
    """

    def __init__(
        self,
        path: Path | str,
        languages: list[str] | None = None,
        gpu: bool = False,
        dpi: int = 200,
        output_unit: CoordinateUnit = CoordinateUnit.POINTS,
    ) -> None:
        """Set up the EasyOCR extractor.

        Args:
            path: Image or PDF file to process (Path object or string).
            languages: OCR language codes; defaults to ["en"].
            gpu: Enable GPU acceleration.
            dpi: Rendering resolution for PDF inputs. Default 200.
            output_unit: Unit for output coordinates. Default POINTS.
        """
        _check_easyocr_installed()
        super().__init__(path, output_unit)
        self.gpu = gpu
        self.dpi = dpi
        self.languages = languages or ["en"]

        # Delegate image access and raw-result conversion to helper objects.
        self._images = ImageLoader(self.path, dpi)
        self._adapter = EasyOCRAdapter()

    def get_page_count(self) -> int:
        """Number of pages/images exposed by the loader."""
        return self._images.page_count

    def extract_page(self, page: int) -> PageExtractionResult:
        """Run EasyOCR on one page/image and return the result."""
        import numpy as np

        try:
            image = self._images.get_page(page)
            px_width, px_height = image.size

            # Obtain the OCR reader and run recognition on the raw pixels.
            reader = get_reader(self.languages, self.gpu)
            raw_results = reader.readtext(np.array(image))
            blocks = self._adapter.convert_result(raw_results)

            native = Page(
                page=page,
                width=float(px_width),
                height=float(px_height),
                texts=blocks,
            )
            # OCR coordinates are pixels; convert to the requested unit.
            converted = self._convert_page(native, CoordinateUnit.PIXELS, self.dpi)
            return PageExtractionResult(page=converted, success=True)

        except Exception as e:
            logger.warning("Failed to extract page %d: %s", page, e)
            return PageExtractionResult(
                page=Page(page=page, width=0, height=0, texts=[]),
                success=False,
                error=str(e),
            )

    def get_extractor_metadata(self) -> ExtractorMetadata:
        """Describe this extractor: engine, languages, and dpi for PDFs."""
        details = {"ocr_engine": "easyocr", "languages": self.languages}
        if self._images.is_pdf:
            details["dpi"] = self.dpi
        return ExtractorMetadata(
            extractor_type=ExtractorType.EASYOCR,
            extra=details,
        )

    def get_init_params(self) -> dict[str, Any]:
        """Constructor kwargs sufficient to rebuild this extractor in a worker."""
        return dict(
            path=self.path,
            languages=self.languages,
            gpu=self.gpu,
            dpi=self.dpi,
            output_unit=self.output_unit,
        )

    def close(self) -> None:
        """Free resources held by the image loader."""
        self._images.close()

__init__

__init__(
    path: Path | str,
    languages: list[str] | None = None,
    gpu: bool = False,
    dpi: int = 200,
    output_unit: CoordinateUnit = CoordinateUnit.POINTS,
) -> None

Initialize EasyOCR extractor.

Parameters:

Name Type Description Default
path Path | str

Path to the image or PDF file (Path object or string).

required
languages list[str] | None

List of language codes for OCR. Defaults to ["en"].

None
gpu bool

Whether to use GPU acceleration.

False
dpi int

DPI for PDF-to-image conversion. Default 200.

200
output_unit CoordinateUnit

Coordinate unit for output. Default POINTS.

POINTS
Source code in unifex/ocr/extractors/easy_ocr.py
def __init__(
    self,
    path: Path | str,
    languages: list[str] | None = None,
    gpu: bool = False,
    dpi: int = 200,
    output_unit: CoordinateUnit = CoordinateUnit.POINTS,
) -> None:
    """Set up the EasyOCR extractor.

    Args:
        path: Image or PDF file to process (Path object or string).
        languages: OCR language codes; defaults to ["en"].
        gpu: Enable GPU acceleration.
        dpi: Rendering resolution for PDF inputs. Default 200.
        output_unit: Unit for output coordinates. Default POINTS.
    """
    _check_easyocr_installed()
    super().__init__(path, output_unit)
    self.gpu = gpu
    self.dpi = dpi
    self.languages = languages or ["en"]

    # Delegate image access and raw-result conversion to helper objects.
    self._images = ImageLoader(self.path, dpi)
    self._adapter = EasyOCRAdapter()

get_page_count

get_page_count() -> int

Return number of pages/images loaded.

Source code in unifex/ocr/extractors/easy_ocr.py
def get_page_count(self) -> int:
    """Number of pages/images exposed by the underlying loader."""
    loader = self._images
    return loader.page_count

extract_page

extract_page(page: int) -> PageExtractionResult

Extract text from a single image/page.

Source code in unifex/ocr/extractors/easy_ocr.py
def extract_page(self, page: int) -> PageExtractionResult:
    """Run EasyOCR on one page/image and return the result."""
    import numpy as np

    try:
        image = self._images.get_page(page)
        px_width, px_height = image.size

        # Obtain the OCR reader and run recognition on the raw pixels.
        reader = get_reader(self.languages, self.gpu)
        raw_results = reader.readtext(np.array(image))
        blocks = self._adapter.convert_result(raw_results)

        native = Page(
            page=page,
            width=float(px_width),
            height=float(px_height),
            texts=blocks,
        )
        # OCR coordinates are pixels; convert to the requested unit.
        converted = self._convert_page(native, CoordinateUnit.PIXELS, self.dpi)
        return PageExtractionResult(page=converted, success=True)

    except Exception as e:
        logger.warning("Failed to extract page %d: %s", page, e)
        return PageExtractionResult(
            page=Page(page=page, width=0, height=0, texts=[]),
            success=False,
            error=str(e),
        )

get_extractor_metadata

get_extractor_metadata() -> ExtractorMetadata

Return extractor metadata.

Source code in unifex/ocr/extractors/easy_ocr.py
def get_extractor_metadata(self) -> ExtractorMetadata:
    """Describe this extractor: engine, languages, and dpi for PDFs."""
    details = {"ocr_engine": "easyocr", "languages": self.languages}
    if self._images.is_pdf:
        details["dpi"] = self.dpi
    return ExtractorMetadata(extractor_type=ExtractorType.EASYOCR, extra=details)

get_init_params

get_init_params() -> dict[str, Any]

Return parameters for recreating this extractor in a worker process.

Source code in unifex/ocr/extractors/easy_ocr.py
def get_init_params(self) -> dict[str, Any]:
    """Constructor kwargs sufficient to rebuild this extractor in a worker."""
    return dict(
        path=self.path,
        languages=self.languages,
        gpu=self.gpu,
        dpi=self.dpi,
        output_unit=self.output_unit,
    )

close

close() -> None

Release resources.

Source code in unifex/ocr/extractors/easy_ocr.py
def close(self) -> None:
    """Free resources held by the underlying image loader."""
    loader = self._images
    loader.close()

TesseractOcrExtractor

OCR using Tesseract.

unifex.ocr.extractors.tesseract_ocr.TesseractOcrExtractor

Bases: BaseExtractor

Extract text from images or PDFs using Tesseract OCR.

Composes ImageLoader for image handling, Tesseract for OCR processing, and TesseractAdapter for result conversion.

Source code in unifex/ocr/extractors/tesseract_ocr.py
class TesseractOcrExtractor(BaseExtractor):
    """OCR extractor backed by Tesseract.

    Image loading is delegated to ImageLoader, recognition to pytesseract,
    and conversion of raw engine output to TesseractAdapter.
    """

    def __init__(
        self,
        path: Path | str,
        languages: list[str] | None = None,
        dpi: int = 200,
        output_unit: CoordinateUnit = CoordinateUnit.POINTS,
    ) -> None:
        """Set up the Tesseract extractor.

        Args:
            path: Image or PDF file to process (Path object or string).
            languages: 2-letter ISO 639-1 codes (e.g. ["en", "fr"]);
                       defaults to ["en"]. Converted internally to
                       Tesseract's own language codes.
            dpi: Rendering resolution for PDF inputs. Default 200.
            output_unit: Unit for output coordinates. Default POINTS.
        """
        _check_pytesseract_installed()
        super().__init__(path, output_unit)
        requested = languages or ["en"]
        # Keep the caller's ISO codes for metadata reporting...
        self.languages = requested
        # ...and Tesseract-format codes for actually invoking the engine.
        self._tesseract_languages = [_convert_lang_code(code) for code in requested]
        self.dpi = dpi

        # Delegate image access and raw-result conversion to helper objects.
        self._images = ImageLoader(self.path, dpi)
        self._adapter = TesseractAdapter()

    def get_page_count(self) -> int:
        """Number of pages/images exposed by the loader."""
        return self._images.page_count

    def extract_page(self, page: int) -> PageExtractionResult:
        """Run Tesseract OCR on one page/image and return the result."""
        import pytesseract

        try:
            image = self._images.get_page(page)
            px_width, px_height = image.size

            # Tesseract takes its languages as a single "+"-joined string.
            joined = "+".join(self._tesseract_languages)
            ocr_data = pytesseract.image_to_data(
                image, lang=joined, output_type=pytesseract.Output.DICT
            )
            blocks = self._adapter.convert_result(ocr_data)

            native = Page(
                page=page,
                width=float(px_width),
                height=float(px_height),
                texts=blocks,
            )
            # OCR coordinates are pixels; convert to the requested unit.
            converted = self._convert_page(native, CoordinateUnit.PIXELS, self.dpi)
            return PageExtractionResult(page=converted, success=True)

        except Exception as e:
            logger.warning("Failed to extract page %d: %s", page, e)
            return PageExtractionResult(
                page=Page(page=page, width=0, height=0, texts=[]),
                success=False,
                error=str(e),
            )

    def get_extractor_metadata(self) -> ExtractorMetadata:
        """Describe this extractor: engine, languages, and dpi for PDFs."""
        details = {"ocr_engine": "tesseract", "languages": self.languages}
        if self._images.is_pdf:
            details["dpi"] = self.dpi
        return ExtractorMetadata(
            extractor_type=ExtractorType.TESSERACT,
            extra=details,
        )

    def get_init_params(self) -> dict[str, Any]:
        """Constructor kwargs sufficient to rebuild this extractor in a worker."""
        return dict(
            path=self.path,
            languages=self.languages,
            dpi=self.dpi,
            output_unit=self.output_unit,
        )

    def close(self) -> None:
        """Free resources held by the image loader."""
        self._images.close()

__init__

__init__(
    path: Path | str,
    languages: list[str] | None = None,
    dpi: int = 200,
    output_unit: CoordinateUnit = CoordinateUnit.POINTS,
) -> None

Initialize Tesseract OCR extractor.

Parameters:

Name Type Description Default
path Path | str

Path to the image or PDF file (Path object or string).

required
languages list[str] | None

List of 2-letter ISO 639-1 language codes (e.g., ["en", "fr"]). Defaults to ["en"]. Codes are converted to Tesseract format internally.

None
dpi int

DPI for PDF-to-image conversion. Default 200.

200
output_unit CoordinateUnit

Coordinate unit for output. Default POINTS.

POINTS
Source code in unifex/ocr/extractors/tesseract_ocr.py
def __init__(
    self,
    path: Path | str,
    languages: list[str] | None = None,
    dpi: int = 200,
    output_unit: CoordinateUnit = CoordinateUnit.POINTS,
) -> None:
    """Set up the Tesseract extractor.

    Args:
        path: Image or PDF file to process (Path object or string).
        languages: 2-letter ISO 639-1 codes (e.g. ["en", "fr"]); defaults
                   to ["en"]. Converted internally to Tesseract's codes.
        dpi: Rendering resolution for PDF inputs. Default 200.
        output_unit: Unit for output coordinates. Default POINTS.
    """
    _check_pytesseract_installed()
    super().__init__(path, output_unit)
    requested = languages or ["en"]
    # Keep the caller's ISO codes for metadata reporting...
    self.languages = requested
    # ...and Tesseract-format codes for actually invoking the engine.
    self._tesseract_languages = [_convert_lang_code(code) for code in requested]
    self.dpi = dpi

    # Delegate image access and raw-result conversion to helper objects.
    self._images = ImageLoader(self.path, dpi)
    self._adapter = TesseractAdapter()

get_page_count

get_page_count() -> int

Return number of pages/images loaded.

Source code in unifex/ocr/extractors/tesseract_ocr.py
def get_page_count(self) -> int:
    """Number of pages/images exposed by the underlying loader."""
    loader = self._images
    return loader.page_count

extract_page

extract_page(page: int) -> PageExtractionResult

Extract text from a single image/page.

Source code in unifex/ocr/extractors/tesseract_ocr.py
def extract_page(self, page: int) -> PageExtractionResult:
    """Run Tesseract OCR on one page/image and return the result."""
    import pytesseract

    try:
        image = self._images.get_page(page)
        px_width, px_height = image.size

        # Tesseract takes its languages as a single "+"-joined string.
        joined = "+".join(self._tesseract_languages)
        ocr_data = pytesseract.image_to_data(
            image, lang=joined, output_type=pytesseract.Output.DICT
        )
        blocks = self._adapter.convert_result(ocr_data)

        native = Page(
            page=page,
            width=float(px_width),
            height=float(px_height),
            texts=blocks,
        )
        # OCR coordinates are pixels; convert to the requested unit.
        converted = self._convert_page(native, CoordinateUnit.PIXELS, self.dpi)
        return PageExtractionResult(page=converted, success=True)

    except Exception as e:
        logger.warning("Failed to extract page %d: %s", page, e)
        return PageExtractionResult(
            page=Page(page=page, width=0, height=0, texts=[]),
            success=False,
            error=str(e),
        )

get_extractor_metadata

get_extractor_metadata() -> ExtractorMetadata

Return extractor metadata.

Source code in unifex/ocr/extractors/tesseract_ocr.py
def get_extractor_metadata(self) -> ExtractorMetadata:
    """Describe this extractor: engine, languages, and dpi for PDFs."""
    details = {"ocr_engine": "tesseract", "languages": self.languages}
    if self._images.is_pdf:
        details["dpi"] = self.dpi
    return ExtractorMetadata(extractor_type=ExtractorType.TESSERACT, extra=details)

get_init_params

get_init_params() -> dict[str, Any]

Return parameters for recreating this extractor in a worker process.

Source code in unifex/ocr/extractors/tesseract_ocr.py
def get_init_params(self) -> dict[str, Any]:
    """Constructor kwargs sufficient to rebuild this extractor in a worker."""
    return dict(
        path=self.path,
        languages=self.languages,
        dpi=self.dpi,
        output_unit=self.output_unit,
    )

close

close() -> None

Release resources.

Source code in unifex/ocr/extractors/tesseract_ocr.py
def close(self) -> None:
    """Free resources held by the underlying image loader."""
    loader = self._images
    loader.close()

PaddleOcrExtractor

OCR using PaddleOCR.

unifex.ocr.extractors.paddle_ocr.PaddleOcrExtractor

Bases: BaseExtractor

Extract text from images or PDFs using PaddleOCR.

Composes ImageLoader for image handling, PaddleOCR for OCR, and PaddleOCRAdapter for result conversion.

PaddleOCR model is loaded lazily on first extraction and cached globally.

Source code in unifex/ocr/extractors/paddle_ocr.py
class PaddleOcrExtractor(BaseExtractor):
    """OCR extractor backed by PaddleOCR.

    Image loading is delegated to ImageLoader and raw-result conversion to
    PaddleOCRAdapter. The PaddleOCR model itself is loaded lazily on first
    extraction and cached globally.
    """

    def __init__(
        self,
        path: Path | str,
        lang: str = "en",
        use_gpu: bool = False,
        dpi: int = 200,
        output_unit: CoordinateUnit = CoordinateUnit.POINTS,
    ) -> None:
        """Set up the PaddleOCR extractor.

        Args:
            path: Image or PDF file to process (Path object or string).
            lang: PaddleOCR language code, e.g. "en" (English), "ch"
                  (Chinese), "fr" (French), "german", "japan", "korean".
                  See PaddleOCR docs for the full list.
            use_gpu: Enable GPU acceleration.
            dpi: Rendering resolution for PDF inputs. Default 200.
            output_unit: Unit for output coordinates. Default POINTS.
        """
        _check_paddleocr_installed()
        super().__init__(path, output_unit)
        self.use_gpu = use_gpu
        self.dpi = dpi
        self.lang = lang

        # Helpers only; the OCR model is created lazily on first use.
        self._images = ImageLoader(self.path, dpi)
        self._adapter = PaddleOCRAdapter()

    def get_page_count(self) -> int:
        """Number of pages/images exposed by the loader."""
        return self._images.page_count

    def extract_page(self, page: int) -> PageExtractionResult:
        """Run PaddleOCR on one page/image and return the result."""
        import numpy as np

        try:
            image = self._images.get_page(page)
            px_width, px_height = image.size

            # Lazily load/retrieve the cached OCR model.
            engine = get_paddle_ocr(self.lang, self.use_gpu)
            pixels = np.array(image)

            # Entry point differs by PaddleOCR major version.
            version = _get_paddle_major_version()
            if version >= PADDLEOCR_V3_MAJOR:
                raw = engine.predict(pixels)
            else:
                raw = engine.ocr(pixels, cls=True)

            blocks = self._adapter.convert_result(raw, version)

            native = Page(
                page=page,
                width=float(px_width),
                height=float(px_height),
                texts=blocks,
            )
            # OCR coordinates are pixels; convert to the requested unit.
            converted = self._convert_page(native, CoordinateUnit.PIXELS, self.dpi)
            return PageExtractionResult(page=converted, success=True)

        except Exception as e:
            logger.warning("Failed to extract page %d: %s", page, e)
            return PageExtractionResult(
                page=Page(page=page, width=0, height=0, texts=[]),
                success=False,
                error=str(e),
            )

    def extract_tables(
        self,
        pages: list[int] | None = None,
    ) -> list[Table]:
        """Extract tables from the document using PPStructure.

        Args:
            pages: 0-indexed page numbers; when None, every page is processed.

        Returns:
            Table objects, each tagged with its source page number.
        """
        import numpy as np

        target_pages = list(range(self.get_page_count())) if pages is None else pages

        found: list[Table] = []
        engine = get_ppstructure(self.lang, self.use_gpu)

        for page_num in target_pages:
            try:
                pixels = np.array(self._images.get_page(page_num))

                # PPStructure yields layout elements; keep only the tables.
                for element in engine(pixels):
                    if element.get("type") == "table":
                        found.append(
                            self._adapter.convert_table_result(element, page=page_num)
                        )

            except Exception as e:
                logger.warning("Failed to extract tables from page %d: %s", page_num, e)

        return found

    def get_extractor_metadata(self) -> ExtractorMetadata:
        """Describe this extractor: engine, language, and dpi for PDFs."""
        details = {"ocr_engine": "paddleocr", "languages": self.lang}
        if self._images.is_pdf:
            details["dpi"] = self.dpi
        return ExtractorMetadata(
            extractor_type=ExtractorType.PADDLE,
            extra=details,
        )

    def get_init_params(self) -> dict[str, Any]:
        """Constructor kwargs sufficient to rebuild this extractor in a worker."""
        return dict(
            path=self.path,
            lang=self.lang,
            use_gpu=self.use_gpu,
            dpi=self.dpi,
            output_unit=self.output_unit,
        )

    def close(self) -> None:
        """Free resources held by the image loader."""
        self._images.close()

__init__

__init__(
    path: Path | str,
    lang: str = "en",
    use_gpu: bool = False,
    dpi: int = 200,
    output_unit: CoordinateUnit = CoordinateUnit.POINTS,
) -> None

Initialize PaddleOCR extractor.

Parameters:

Name Type Description Default
path Path | str

Path to the image or PDF file (Path object or string).

required
lang str

Language code for OCR. Common values:

- `"en"` — English
- `"ch"` — Chinese
- `"fr"` — French
- `"german"` — German
- `"japan"` — Japanese
- `"korean"` — Korean

See PaddleOCR docs for the full list.

'en'
use_gpu bool

Whether to use GPU acceleration.

False
dpi int

DPI for PDF-to-image conversion. Default 200.

200
output_unit CoordinateUnit

Coordinate unit for output. Default POINTS.

POINTS
Source code in unifex/ocr/extractors/paddle_ocr.py
def __init__(
    self,
    path: Path | str,
    lang: str = "en",
    use_gpu: bool = False,
    dpi: int = 200,
    output_unit: CoordinateUnit = CoordinateUnit.POINTS,
) -> None:
    """Initialize PaddleOCR extractor.

    Args:
        path: Path to the image or PDF file (Path object or string).
        lang: Language code for OCR. Common values:
              - "en" for English
              - "ch" for Chinese
              - "fr" for French
              - "german" for German
              - "japan" for Japanese
              - "korean" for Korean
              See PaddleOCR docs for full list.
        use_gpu: Whether to use GPU acceleration.
        dpi: DPI for PDF-to-image conversion. Default 200.
        output_unit: Coordinate unit for output. Default POINTS.
    """
    # Fail fast with a clear error if the optional paddleocr dependency
    # is missing, before any state is set up.
    _check_paddleocr_installed()
    super().__init__(path, output_unit)
    self.lang = lang
    self.use_gpu = use_gpu
    self.dpi = dpi

    # Compose lightweight components only; the OCR model itself is
    # loaded lazily on first use (see extract_page).
    self._images = ImageLoader(self.path, dpi)
    self._adapter = PaddleOCRAdapter()

get_page_count

get_page_count() -> int

Return number of pages/images loaded.

Source code in unifex/ocr/extractors/paddle_ocr.py
def get_page_count(self) -> int:
    """Return number of pages/images loaded (delegates to the ImageLoader)."""
    return self._images.page_count

extract_page

extract_page(page: int) -> PageExtractionResult

Extract text from a single image/page.

Source code in unifex/ocr/extractors/paddle_ocr.py
def extract_page(self, page: int) -> PageExtractionResult:
    """Extract text from a single image/page.

    Args:
        page: Page number (0-indexed).

    Returns:
        PageExtractionResult with success=True and the extracted Page, or
        success=False with an empty Page and the error message on failure.
    """
    import numpy as np

    try:
        img = self._images.get_page(page)
        width, height = img.size

        # Run OCR pipeline (lazy load model)
        ocr = get_paddle_ocr(self.lang, self.use_gpu)
        img_array = np.array(img)

        # Use version-specific API: PaddleOCR >= v3 exposes predict();
        # older versions use ocr(..., cls=True).
        major_version = _get_paddle_major_version()
        if major_version >= PADDLEOCR_V3_MAJOR:
            result = ocr.predict(img_array)
        else:
            result = ocr.ocr(img_array, cls=True)

        # The adapter normalizes the version-specific result shape.
        text_blocks = self._adapter.convert_result(result, major_version)

        result_page = Page(
            page=page,
            width=float(width),
            height=float(height),
            texts=text_blocks,
        )

        # Convert from native PIXELS to output_unit
        result_page = self._convert_page(result_page, CoordinateUnit.PIXELS, self.dpi)
        return PageExtractionResult(page=result_page, success=True)

    except Exception as e:
        # Per-page failures are reported, not raised, so callers can keep
        # processing the remaining pages.
        logger.warning("Failed to extract page %d: %s", page, e)
        return PageExtractionResult(
            page=Page(page=page, width=0, height=0, texts=[]),
            success=False,
            error=str(e),
        )

extract_tables

extract_tables(
    pages: list[int] | None = None,
) -> list[Table]

Extract tables from document using PPStructure.

Parameters:

Name Type Description Default
pages list[int] | None

List of page numbers to extract (0-indexed). If None, extracts from all pages.

None

Returns:

Type Description
list[Table]

List of Table objects with page field indicating source page.

Source code in unifex/ocr/extractors/paddle_ocr.py
def extract_tables(
    self,
    pages: list[int] | None = None,
) -> list[Table]:
    """Extract tables from document using PPStructure.

    Args:
        pages: List of page numbers to extract (0-indexed).
               If None, extracts from all pages.

    Returns:
        List of Table objects with page field indicating source page.
    """
    import numpy as np

    if pages is None:
        pages = list(range(self.get_page_count()))

    all_tables: list[Table] = []
    # Layout-analysis engine used for table detection.
    engine = get_ppstructure(self.lang, self.use_gpu)

    for page_num in pages:
        try:
            img = self._images.get_page(page_num)
            img_array = np.array(img)

            # PPStructure returns list of layout elements
            result = engine(img_array)

            for element in result:
                # Keep only the layout elements classified as tables.
                if element.get("type") == "table":
                    table = self._adapter.convert_table_result(element, page=page_num)
                    all_tables.append(table)

        except Exception as e:
            # One bad page must not abort table extraction for the whole
            # document: log and continue with the next page.
            logger.warning("Failed to extract tables from page %d: %s", page_num, e)

    return all_tables

get_extractor_metadata

get_extractor_metadata() -> ExtractorMetadata

Return extractor metadata.

Source code in unifex/ocr/extractors/paddle_ocr.py
def get_extractor_metadata(self) -> ExtractorMetadata:
    """Return extractor metadata.

    DPI is reported only for PDF inputs (images use native resolution).
    """
    details = {"ocr_engine": "paddleocr", "languages": self.lang}
    if self._images.is_pdf:
        details["dpi"] = self.dpi
    return ExtractorMetadata(extractor_type=ExtractorType.PADDLE, extra=details)

get_init_params

get_init_params() -> dict[str, Any]

Return parameters for recreating this extractor in a worker process.

Source code in unifex/ocr/extractors/paddle_ocr.py
def get_init_params(self) -> dict[str, Any]:
    """Return the constructor arguments needed to recreate this extractor
    in a worker process (for parallel extraction)."""
    return dict(
        path=self.path,
        lang=self.lang,
        use_gpu=self.use_gpu,
        dpi=self.dpi,
        output_unit=self.output_unit,
    )

close

close() -> None

Release resources.

Source code in unifex/ocr/extractors/paddle_ocr.py
def close(self) -> None:
    """Release the image resources held by this extractor."""
    self._images.close()

Cloud OCR Extractors

AzureDocumentIntelligenceExtractor

Azure Document Intelligence OCR.

unifex.ocr.extractors.azure_di.AzureDocumentIntelligenceExtractor

Bases: BaseExtractor

Extract text from documents using Azure Document Intelligence.

Source code in unifex/ocr/extractors/azure_di.py
class AzureDocumentIntelligenceExtractor(BaseExtractor):
    """Extract text from documents using Azure Document Intelligence."""

    def __init__(
        self,
        path: Path | str,
        endpoint: str,
        key: str,
        model_id: str = "prebuilt-read",
        output_unit: CoordinateUnit = CoordinateUnit.POINTS,
    ) -> None:
        """Initialize the extractor and analyze the document immediately.

        Args:
            path: Path to the document file (Path object or string).
            endpoint: Azure Document Intelligence service endpoint URL.
            key: API key for the service.
            model_id: Azure analysis model to run. Default "prebuilt-read".
            output_unit: Coordinate unit for output. Default POINTS.
        """
        _check_azure_installed()
        from azure.ai.documentintelligence import DocumentIntelligenceClient
        from azure.core.credentials import AzureKeyCredential

        super().__init__(path, output_unit)
        self.endpoint = endpoint
        self.model_id = model_id
        self._client: DocumentIntelligenceClient = DocumentIntelligenceClient(
            endpoint=endpoint,
            credential=AzureKeyCredential(key),
        )
        self._result: Any | None = None
        self._adapter: AzureDocumentIntelligenceAdapter | None = None
        # Analysis is eager: the document is uploaded and processed once
        # here, so later extract_page() calls only read the cached result.
        self._analyze_document()

    def _analyze_document(self) -> None:
        """Send document to Azure DI for analysis."""
        try:
            with open(self.path, "rb") as f:
                poller = self._client.begin_analyze_document(
                    model_id=self.model_id,
                    body=f,
                    content_type="application/octet-stream",
                )
                self._result = poller.result()
                self._adapter = AzureDocumentIntelligenceAdapter(self._result, self.model_id)
        except (OSError, ValueError) as e:
            # Analysis failures are non-fatal: install a None-backed adapter
            # so metadata queries still work after a failed analysis.
            logger.warning("Failed to analyze document with Azure DI: %s", e)
            self._result = None
            self._adapter = AzureDocumentIntelligenceAdapter(None, self.model_id)

    def get_page_count(self) -> int:
        """Return the page count reported by the adapter (0 if no adapter)."""
        if self._adapter is None:
            return 0
        return self._adapter.page_count

    def extract_page(self, page: int) -> PageExtractionResult:
        """Extract a single page by number (0-indexed)."""
        try:
            if self._adapter is None:
                raise ValueError("Document analysis failed")

            converted_page = self._adapter.convert_page(page)
            # Convert from native INCHES to output_unit
            converted_page = self._convert_page(converted_page, CoordinateUnit.INCHES)
            return PageExtractionResult(page=converted_page, success=True)

        except (IndexError, ValueError, AttributeError) as e:
            # Failures are reported in the result rather than raised.
            logger.warning("Failed to extract page %d from Azure DI result: %s", page, e)
            return PageExtractionResult(
                page=Page(page=page, width=0, height=0, texts=[]),
                success=False,
                error=str(e),
            )

    def get_extractor_metadata(self) -> ExtractorMetadata:
        """Return extractor metadata, even when no analysis result exists."""
        if self._adapter is None:
            return AzureDocumentIntelligenceAdapter(None, self.model_id).get_metadata()
        return self._adapter.get_metadata()

    def close(self) -> None:
        """Close the underlying Azure client."""
        if self._client is not None:
            self._client.close()

extract_page

extract_page(page: int) -> PageExtractionResult

Extract a single page by number (0-indexed).

Source code in unifex/ocr/extractors/azure_di.py
def extract_page(self, page: int) -> PageExtractionResult:
    """Extract a single page by number (0-indexed)."""
    try:
        if self._adapter is None:
            raise ValueError("Document analysis failed")

        page_data = self._adapter.convert_page(page)
        # Azure DI's native unit is inches; convert to the requested unit.
        page_data = self._convert_page(page_data, CoordinateUnit.INCHES)
        return PageExtractionResult(page=page_data, success=True)

    except (IndexError, ValueError, AttributeError) as e:
        logger.warning("Failed to extract page %d from Azure DI result: %s", page, e)
        empty = Page(page=page, width=0, height=0, texts=[])
        return PageExtractionResult(page=empty, success=False, error=str(e))

GoogleDocumentAIExtractor

Google Document AI OCR.

unifex.ocr.extractors.google_docai.GoogleDocumentAIExtractor

Bases: BaseExtractor

Extract text from documents using Google Document AI.

Source code in unifex/ocr/extractors/google_docai.py
class GoogleDocumentAIExtractor(BaseExtractor):
    """Extract text from documents using Google Document AI."""

    def __init__(
        self,
        path: Path | str,
        processor_name: str,
        credentials_path: str,
        mime_type: str | None = None,
        output_unit: CoordinateUnit = CoordinateUnit.POINTS,
    ) -> None:
        """Initialize Google Document AI extractor.

        Args:
            path: Path to the document file.
            processor_name: Full processor resource name, e.g.,
                'projects/{project}/locations/{location}/processors/{processor_id}'
            credentials_path: Path to service account JSON credentials file.
            mime_type: Optional MIME type. If not provided, will be inferred from file extension.
            output_unit: Coordinate unit for output. Default POINTS.
        """
        _check_google_docai_installed()
        from google.cloud import documentai
        from google.oauth2 import service_account

        super().__init__(path, output_unit)
        self.processor_name = processor_name
        self.credentials_path = credentials_path
        self.mime_type = mime_type or self._infer_mime_type()

        # Create credentials from service account file
        credentials = service_account.Credentials.from_service_account_file(credentials_path)

        # The API endpoint is regional, so derive it from the processor name.
        # Format: projects/{project}/locations/{location}/processors/{processor_id}
        location = self._extract_location_from_processor_name(processor_name)
        opts = {"api_endpoint": f"{location}-documentai.googleapis.com"}

        self._client = documentai.DocumentProcessorServiceClient(
            credentials=credentials, client_options=opts
        )
        self._document: Document | None = None
        self._adapter: GoogleDocumentAIAdapter | None = None
        # Processing is eager: the document is sent to the API once here, so
        # later extract_page() calls only read the cached result.
        self._process_document()

    def _infer_mime_type(self) -> str:
        """Infer MIME type from file extension (defaults to application/pdf)."""
        suffix = self.path.suffix.lower()
        mime_types = {
            ".pdf": "application/pdf",
            ".png": "image/png",
            ".jpg": "image/jpeg",
            ".jpeg": "image/jpeg",
            ".tiff": "image/tiff",
            ".tif": "image/tiff",
            ".gif": "image/gif",
            ".bmp": "image/bmp",
            ".webp": "image/webp",
        }
        return mime_types.get(suffix, "application/pdf")

    @staticmethod
    def _extract_location_from_processor_name(processor_name: str) -> str:
        """Extract location from processor resource name."""
        # Format: projects/{project}/locations/{location}/processors/{processor_id}
        parts = processor_name.split("/")
        try:
            loc_index = parts.index("locations")
            return parts[loc_index + 1]
        except (ValueError, IndexError):
            return "us"  # Default to US

    def _process_document(self) -> None:
        """Send document to Google Document AI for processing.

        On failure, logs a warning and installs a None-backed adapter so the
        extractor object remains usable.
        """
        from google.cloud import documentai

        try:
            with open(self.path, "rb") as f:
                content = f.read()

            raw_document = documentai.RawDocument(
                content=content,
                mime_type=self.mime_type,
            )

            request = documentai.ProcessRequest(
                name=self.processor_name,
                raw_document=raw_document,
            )

            result = self._client.process_document(request=request)
            self._document = result.document
            self._adapter = GoogleDocumentAIAdapter(self._document, self.processor_name)

        except Exception as e:
            # Was `except (OSError, ValueError, Exception)`: listing Exception
            # subclasses alongside Exception is redundant (flake8-bugbear B014).
            # The broad catch is deliberate: any processing failure (I/O, bad
            # request, API error) must not break construction.
            logger.warning("Failed to process document with Google Document AI: %s", e)
            self._document = None
            self._adapter = GoogleDocumentAIAdapter(None, self.processor_name)

    def get_page_count(self) -> int:
        """Return the page count reported by the adapter (0 if no adapter)."""
        if self._adapter is None:
            return 0
        return self._adapter.page_count

    def extract_page(self, page: int) -> PageExtractionResult:
        """Extract a single page by number (0-indexed)."""
        try:
            if self._adapter is None:
                raise ValueError("Document processing failed")

            converted_page = self._adapter.convert_page(page)
            # Google DocAI outputs pixels after denormalization
            # Use 72 DPI as standard PDF resolution for conversion
            converted_page = self._convert_page(converted_page, CoordinateUnit.PIXELS, dpi=72.0)
            return PageExtractionResult(page=converted_page, success=True)

        except (IndexError, ValueError, AttributeError) as e:
            logger.warning("Failed to extract page %d from Google Document AI result: %s", page, e)
            return PageExtractionResult(
                page=Page(page=page, width=0, height=0, texts=[]),
                success=False,
                error=str(e),
            )

    def get_extractor_metadata(self) -> ExtractorMetadata:
        """Return extractor metadata, even when processing failed."""
        if self._adapter is None:
            return GoogleDocumentAIAdapter(None, self.processor_name).get_metadata()
        return self._adapter.get_metadata()

    def close(self) -> None:
        """Close the underlying gRPC transport."""
        if self._client is not None:
            self._client.transport.close()

__init__

__init__(
    path: Path | str,
    processor_name: str,
    credentials_path: str,
    mime_type: str | None = None,
    output_unit: CoordinateUnit = CoordinateUnit.POINTS,
) -> None

Initialize Google Document AI extractor.

Parameters:

Name Type Description Default
path Path | str

Path to the document file.

required
processor_name str

Full processor resource name, e.g., 'projects/{project}/locations/{location}/processors/{processor_id}'

required
credentials_path str

Path to service account JSON credentials file.

required
mime_type str | None

Optional MIME type. If not provided, will be inferred from file extension.

None
output_unit CoordinateUnit

Coordinate unit for output. Default POINTS.

POINTS
Source code in unifex/ocr/extractors/google_docai.py
def __init__(
    self,
    path: Path | str,
    processor_name: str,
    credentials_path: str,
    mime_type: str | None = None,
    output_unit: CoordinateUnit = CoordinateUnit.POINTS,
) -> None:
    """Initialize Google Document AI extractor.

    Args:
        path: Path to the document file.
        processor_name: Full processor resource name, e.g.,
            'projects/{project}/locations/{location}/processors/{processor_id}'
        credentials_path: Path to service account JSON credentials file.
        mime_type: Optional MIME type. If not provided, will be inferred from file extension.
        output_unit: Coordinate unit for output. Default POINTS.
    """
    _check_google_docai_installed()
    from google.cloud import documentai
    from google.oauth2 import service_account

    super().__init__(path, output_unit)
    self.processor_name = processor_name
    self.credentials_path = credentials_path
    self.mime_type = mime_type or self._infer_mime_type()

    # Create credentials from service account file
    credentials = service_account.Credentials.from_service_account_file(credentials_path)

    # The API endpoint is regional, so derive it from the processor name.
    # Format: projects/{project}/locations/{location}/processors/{processor_id}
    location = self._extract_location_from_processor_name(processor_name)
    opts = {"api_endpoint": f"{location}-documentai.googleapis.com"}

    self._client = documentai.DocumentProcessorServiceClient(
        credentials=credentials, client_options=opts
    )
    self._document: Document | None = None
    self._adapter: GoogleDocumentAIAdapter | None = None
    # Processing is eager: the document is sent to the API once here.
    self._process_document()

extract_page

extract_page(page: int) -> PageExtractionResult

Extract a single page by number (0-indexed).

Source code in unifex/ocr/extractors/google_docai.py
def extract_page(self, page: int) -> PageExtractionResult:
    """Extract a single page by number (0-indexed)."""
    try:
        if self._adapter is None:
            raise ValueError("Document processing failed")

        page_data = self._adapter.convert_page(page)
        # Google DocAI outputs pixels after denormalization; convert them
        # using 72 DPI, the standard PDF resolution.
        page_data = self._convert_page(page_data, CoordinateUnit.PIXELS, dpi=72.0)
        return PageExtractionResult(page=page_data, success=True)

    except (IndexError, ValueError, AttributeError) as e:
        logger.warning("Failed to extract page %d from Google Document AI result: %s", page, e)
        empty = Page(page=page, width=0, height=0, texts=[])
        return PageExtractionResult(page=empty, success=False, error=str(e))

LLM Extractors

extract_structured

Synchronous LLM extraction function.

unifex.llm_factory.extract_structured

extract_structured(
    path: Path | str,
    model: str,
    *,
    schema: type[T],
    prompt: str | None = None,
    pages: list[int] | None = None,
    dpi: int = 200,
    max_retries: int = 3,
    temperature: float = 0.0,
    credentials: dict[str, str] | None = None,
    base_url: str | None = None,
    headers: dict[str, str] | None = None,
    _extractor: Any = None,
) -> LLMExtractionResult[T]
extract_structured(
    path: Path | str,
    model: str,
    *,
    schema: None = None,
    prompt: str | None = None,
    pages: list[int] | None = None,
    dpi: int = 200,
    max_retries: int = 3,
    temperature: float = 0.0,
    credentials: dict[str, str] | None = None,
    base_url: str | None = None,
    headers: dict[str, str] | None = None,
    _extractor: Any = None,
) -> LLMExtractionResult[dict[str, Any]]
extract_structured(
    path: Path | str,
    model: str,
    *,
    schema: type[T] | None = None,
    prompt: str | None = None,
    pages: list[int] | None = None,
    dpi: int = 200,
    max_retries: int = 3,
    temperature: float = 0.0,
    credentials: dict[str, str] | None = None,
    base_url: str | None = None,
    headers: dict[str, str] | None = None,
    _extractor: SingleExtractor[T] | None = None,
) -> LLMExtractionResult[T | dict[str, Any]]

Extract structured data from a document using an LLM.

All specified pages are sent in a single request.

Parameters:

Name Type Description Default
path Path | str

Path to document/image file.

required
model str

Model identifier (e.g., "openai/gpt-4o", "anthropic/claude-3-5-sonnet").

required
schema type[T] | None

Pydantic model for structured output. None for free-form dict.

None
prompt str | None

Custom extraction prompt. Auto-generated from schema if None.

None
pages list[int] | None

Page numbers to extract from (0-indexed). None for all pages.

None
dpi int

DPI for PDF-to-image conversion.

200
max_retries int

Max retry attempts with validation feedback.

3
temperature float

Sampling temperature (0.0 = deterministic).

0.0
credentials dict[str, str] | None

Override credentials dict (otherwise uses env vars).

None
base_url str | None

Custom API base URL for OpenAI-compatible APIs (vLLM, Ollama, etc.).

None
headers dict[str, str] | None

Custom HTTP headers for OpenAI-compatible APIs.

None
_extractor SingleExtractor[T] | None

Internal parameter for dependency injection (testing only).

None

Returns:

Type Description
LLMExtractionResult[T | dict[str, Any]]

LLMExtractionResult containing the extracted data, model info, and provider.

Source code in unifex/llm_factory.py
def extract_structured(  # noqa: PLR0913
    path: Path | str,
    model: str,
    *,
    schema: type[T] | None = None,
    prompt: str | None = None,
    pages: list[int] | None = None,
    dpi: int = 200,
    max_retries: int = 3,
    temperature: float = 0.0,
    credentials: dict[str, str] | None = None,
    base_url: str | None = None,
    headers: dict[str, str] | None = None,
    _extractor: SingleExtractor[T] | None = None,
) -> LLMExtractionResult[T | dict[str, Any]]:
    """Extract structured data from a document using an LLM.

    All specified pages are sent in a single request.

    Args:
        path: Path to document/image file.
        model: Model identifier (e.g., "openai/gpt-4o", "anthropic/claude-3-5-sonnet").
        schema: Pydantic model for structured output. None for free-form dict.
        prompt: Custom extraction prompt. Auto-generated from schema if None.
        pages: Page numbers to extract from (0-indexed). None for all pages.
        dpi: DPI for PDF-to-image conversion.
        max_retries: Max retry attempts with validation feedback.
        temperature: Sampling temperature (0.0 = deterministic).
        credentials: Override credentials dict (otherwise uses env vars).
        base_url: Custom API base URL for OpenAI-compatible APIs (vLLM, Ollama, etc.).
        headers: Custom HTTP headers for OpenAI-compatible APIs.
        _extractor: Internal parameter for dependency injection (testing only).

    Returns:
        [LLMExtractionResult][unifex.llm.models.LLMExtractionResult] containing extracted data,
        model info, and provider.
    """
    # Normalize string paths up front so downstream code always sees a Path.
    if isinstance(path, str):
        path = Path(path)
    # Injected extractor takes precedence; otherwise use the real one.
    run = _extractor or _extract_single
    return run(
        path,
        model,
        schema,
        prompt,
        pages,
        dpi,
        max_retries,
        temperature,
        credentials,
        base_url,
        headers,
    )

extract_structured_async

Asynchronous LLM extraction function.

unifex.llm_factory.extract_structured_async async

extract_structured_async(
    path: Path | str,
    model: str,
    *,
    schema: type[T],
    prompt: str | None = None,
    pages: list[int] | None = None,
    dpi: int = 200,
    max_retries: int = 3,
    temperature: float = 0.0,
    credentials: dict[str, str] | None = None,
    base_url: str | None = None,
    headers: dict[str, str] | None = None,
    _extractor: Any = None,
) -> LLMExtractionResult[T]
extract_structured_async(
    path: Path | str,
    model: str,
    *,
    schema: None = None,
    prompt: str | None = None,
    pages: list[int] | None = None,
    dpi: int = 200,
    max_retries: int = 3,
    temperature: float = 0.0,
    credentials: dict[str, str] | None = None,
    base_url: str | None = None,
    headers: dict[str, str] | None = None,
    _extractor: Any = None,
) -> LLMExtractionResult[dict[str, Any]]
extract_structured_async(
    path: Path | str,
    model: str,
    *,
    schema: type[T] | None = None,
    prompt: str | None = None,
    pages: list[int] | None = None,
    dpi: int = 200,
    max_retries: int = 3,
    temperature: float = 0.0,
    credentials: dict[str, str] | None = None,
    base_url: str | None = None,
    headers: dict[str, str] | None = None,
    _extractor: AsyncSingleExtractor[T] | None = None,
) -> LLMExtractionResult[T | dict[str, Any]]

Async version of extract_structured.

All specified pages are sent in a single request.

Parameters:

Name Type Description Default
path Path | str

Path to document/image file.

required
model str

Model identifier (e.g., "openai/gpt-4o", "anthropic/claude-3-5-sonnet").

required
schema type[T] | None

Pydantic model for structured output. None for free-form dict.

None
prompt str | None

Custom extraction prompt. Auto-generated from schema if None.

None
pages list[int] | None

Page numbers to extract from (0-indexed). None for all pages.

None
dpi int

DPI for PDF-to-image conversion.

200
max_retries int

Max retry attempts with validation feedback.

3
temperature float

Sampling temperature (0.0 = deterministic).

0.0
credentials dict[str, str] | None

Override credentials dict (otherwise uses env vars).

None
base_url str | None

Custom API base URL for OpenAI-compatible APIs (vLLM, Ollama, etc.).

None
headers dict[str, str] | None

Custom HTTP headers for OpenAI-compatible APIs.

None
_extractor AsyncSingleExtractor[T] | None

Internal parameter for dependency injection (testing only).

None

Returns:

Type Description
LLMExtractionResult[T | dict[str, Any]]

LLMExtractionResult containing the extracted data, model info, and provider.

Source code in unifex/llm_factory.py
async def extract_structured_async(  # noqa: PLR0913
    path: Path | str,
    model: str,
    *,
    schema: type[T] | None = None,
    prompt: str | None = None,
    pages: list[int] | None = None,
    dpi: int = 200,
    max_retries: int = 3,
    temperature: float = 0.0,
    credentials: dict[str, str] | None = None,
    base_url: str | None = None,
    headers: dict[str, str] | None = None,
    _extractor: AsyncSingleExtractor[T] | None = None,
) -> LLMExtractionResult[T | dict[str, Any]]:
    """Async version of extract_structured.

    All specified pages are sent in a single request.

    Args:
        path: Path to document/image file.
        model: Model identifier (e.g., "openai/gpt-4o", "anthropic/claude-3-5-sonnet").
        schema: Pydantic model for structured output. None for free-form dict.
        prompt: Custom extraction prompt. Auto-generated from schema if None.
        pages: Page numbers to extract from (0-indexed). None for all pages.
        dpi: DPI for PDF-to-image conversion.
        max_retries: Max retry attempts with validation feedback.
        temperature: Sampling temperature (0.0 = deterministic).
        credentials: Override credentials dict (otherwise uses env vars).
        base_url: Custom API base URL for OpenAI-compatible APIs (vLLM, Ollama, etc.).
        headers: Custom HTTP headers for OpenAI-compatible APIs.
        _extractor: Internal parameter for dependency injection (testing only).

    Returns:
        [LLMExtractionResult][unifex.llm.models.LLMExtractionResult] containing extracted data,
        model info, and provider.
    """
    # Normalize string paths up front so downstream code always sees a Path.
    if isinstance(path, str):
        path = Path(path)
    # Injected extractor takes precedence; otherwise use the real one.
    run = _extractor or _extract_single_async
    return await run(
        path,
        model,
        schema,
        prompt,
        pages,
        dpi,
        max_retries,
        temperature,
        credentials,
        base_url,
        headers,
    )