# researchqa/backend/parser/pdf/MainParser.py

import logging
from typing import List, Dict, Any
from docling.document_converter import DocumentConverter

# 로깅 설정
logger = logging.getLogger(__name__)

class PDFParser:
    """Parse PDF files into plain text and fixed-size chunks using docling."""

    def __init__(self, chunk_size: int = 1000):
        """
        Create a parser with its own docling converter.

        Args:
            chunk_size (int): Maximum number of characters per chunk.
                Defaults to 1000.

        Raises:
            ValueError: If chunk_size is not a positive integer.
        """
        if not isinstance(chunk_size, int) or chunk_size <= 0:
            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
        # Number of characters per chunk produced by chunk_text().
        self.chunk_size = chunk_size
        # docling converter created with its default settings.
        # NOTE(review): the previous comment said OCR is disabled, but no
        # pipeline options are passed here -- confirm the intended OCR setting.
        self.converter = DocumentConverter()

    def _convert_to_text(self, file_path: str) -> tuple[str, list]:
        """
        Convert the PDF with docling and export its text.

        Unlike extract_text_from_pdf(), this helper lets exceptions propagate
        so that callers can tell a failed conversion from an empty document.

        Args:
            file_path (str): Path to the PDF file.

        Returns:
            tuple[str, list]: (full text, list of per-page texts)
        """
        logger.info(f"Docling으로 PDF 파싱 시작: {file_path}")

        result = self.converter.convert(file_path)
        document = result.document

        text_content = document.export_to_text()
        page_texts = self._export_page_texts(document, text_content)

        logger.info(f"PDF 텍스트 추출 완료 (docling): {file_path}, 텍스트 길이: {len(text_content)}, 페이지 수: {len(page_texts)}")
        return text_content, page_texts

    @staticmethod
    def _export_page_texts(document, text_content: str) -> list:
        """Return one text entry per page, or [text_content] when the document exposes no pages."""
        pages = getattr(document, 'pages', None)
        if not pages:
            # No page information: treat the whole text as a single page.
            return [text_content]

        # docling's DoclingDocument.pages is a dict keyed by page number, so
        # iterating it directly yields the int keys rather than page objects.
        if isinstance(pages, dict):
            numbered_pages = list(pages.items())
        else:
            numbered_pages = [(None, page) for page in pages]

        page_texts = []
        for page_no, page in numbered_pages:
            if hasattr(page, 'export_to_text'):
                page_texts.append(page.export_to_text())
            elif page_no is not None:
                page_texts.append(PDFParser._export_single_page(document, page_no))
            else:
                page_texts.append("")
        return page_texts

    @staticmethod
    def _export_single_page(document, page_no) -> str:
        """Export the text of one page via the document-level exporter; return "" if that fails."""
        try:
            # NOTE(review): relies on export_to_markdown() accepting `page_no`
            # and `strict_text`; the other arguments mirror what docling-core's
            # export_to_text() is believed to pass -- confirm against the
            # installed docling-core version.
            return document.export_to_markdown(
                page_no=page_no,
                strict_text=True,
                escape_underscores=False,
                image_placeholder="",
            )
        except Exception as e:
            # Per-page text is best-effort: never lose the full text over it.
            logger.warning(f"Per-page text export failed for page {page_no}: {e}")
            return ""

    def extract_text_from_pdf(self, file_path: str) -> tuple[str, list]:
        """
        Extract the full text and per-page texts from a PDF file (docling).

        This method is best-effort: any failure is logged and reported as an
        empty result instead of being raised.

        Args:
            file_path (str): Path to the PDF file.

        Returns:
            tuple[str, list]: (extracted text, list of per-page texts);
                ("", [""]) when extraction fails.
        """
        try:
            return self._convert_to_text(file_path)
        except Exception as e:
            logger.error(f"PDF 텍스트 추출 실패: {file_path}, 오류: {e}")
            # Fall back to an empty text when docling fails.
            logger.warning(f"Docling 파싱 실패, 빈 텍스트로 처리: {e}")
            return "", [""]

    def chunk_text(self, text: str) -> List[str]:
        """
        Split text into consecutive chunks of at most chunk_size characters.

        Args:
            text (str): Text to split.

        Returns:
            List[str]: Chunks in order; whitespace-only chunks are dropped.
                Empty list when the text is empty, None or whitespace-only.
        """
        if not text or not text.strip():
            return []

        pieces = (
            text[start:start + self.chunk_size]
            for start in range(0, len(text), self.chunk_size)
        )
        chunks = [piece for piece in pieces if piece.strip()]

        logger.info(f"텍스트 청크 분할 완료: {len(chunks)}개 청크")
        return chunks

    def process_pdf(self, file_path: str) -> Dict[str, Any]:
        """
        Extract the text of a PDF file and split it into chunks.

        Args:
            file_path (str): Path to the PDF file.

        Returns:
            Dict[str, Any]: Keys "text_content", "chunks", "chunk_count",
                "page_texts", "page_count" and "success". When processing
                fails, "success" is False, the other values are empty and an
                "error" key holds the exception message.
        """
        try:
            # Use the raising helper so a failed conversion ends up in the
            # except branch instead of being reported as a successful,
            # empty document.
            text_content, page_texts = self._convert_to_text(file_path)

            chunks = self.chunk_text(text_content)

            return {
                "text_content": text_content,
                "chunks": chunks,
                "chunk_count": len(chunks),
                "page_texts": page_texts,
                "page_count": len(page_texts),
                "success": True
            }

        except Exception as e:
            logger.error(f"PDF 처리 실패: {file_path}, 오류: {e}")
            return {
                "text_content": "",
                "chunks": [],
                "chunk_count": 0,
                "page_texts": [],
                "page_count": 0,
                "success": False,
                "error": str(e)
            }

def process(input_path: str) -> str:
    """
    Extract the text of a PDF file (entry point kept for the legacy interface).

    Args:
        input_path (str): Path to the PDF file.

    Returns:
        str: The extracted text, or an empty string when the processing
            result is not marked as successful.
    """
    outcome = PDFParser().process_pdf(input_path)

    # Guard clause: log the reported error and hand back an empty text.
    if not outcome["success"]:
        logger.error(f"PDF 처리 실패: {outcome.get('error', 'Unknown error')}")
        return ""

    return outcome["text_content"]

if __name__ == '__main__':
    # Manual smoke test: parse a sample PDF and print a short summary.
    extracted = process('a.pdf')
    print(f"추출된 텍스트 길이: {len(extracted)}")
    preview = extracted[:200]
    print(f"텍스트 미리보기: {preview}...")