init

2025-10-13 08:29:41 +09:00 · 2025-10-13 08:29:41 +09:00 · bc4aacea62
commit bc4aacea62
parent d7d57b0327
4 changed files with 412 additions and 27 deletions
--- a/backend/services/context_retrieval.py
+++ b/backend/services/context_retrieval.py
@ -0,0 +1,266 @@
 """
 컨텍스트 기반 검색 시스템
 - 컨텍스트 임베딩: 질문과 문서를 함께 임베딩하여 더 정확한 검색
 - 컨텍스트 BM25: 질문과 문서의 컨텍스트를 고려한 키워드 검색
 - Reranker: 검색 결과를 재순위화하여 정확도 향상
 """
 import logging
 import numpy as np
 from typing import List, Dict, Any, Tuple
 from rank_bm25 import BM25Okapi
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
 import re
 from collections import Counter
 logger = logging.getLogger(__name__)
 class ContextEmbedding:
    """Embedding search that scores each document together with the question.

    Every candidate document is wrapped in a combined "question + document"
    text, embedded with the SentenceTransformer model, and compared against
    the embedding of the bare question using cosine similarity.
    """

    def __init__(self, model_name: str = "jhgan/ko-sroberta-multitask"):
        """Load the SentenceTransformer model identified by ``model_name``."""
        self.model = SentenceTransformer(model_name)
        logger.info(f"✅ 컨텍스트 임베딩 모델 로드 완료: {model_name}")

    @staticmethod
    def _build_context(question: str, document: str) -> str:
        """Return the combined question/document text that gets embedded."""
        return f"질문: {question}\n문서: {document}"

    def create_context_embedding(self, question: str, document: str) -> np.ndarray:
        """Embed the question and the document as one combined text.

        Args:
            question: The user question.
            document: The document text to pair with the question.

        Returns:
            Whatever ``self.model.encode`` returns for the combined text.
        """
        return self.model.encode(self._build_context(question, document))

    def search_with_context(self, question: str, documents: List[Dict[str, Any]], top_k: int = 10) -> List[Dict[str, Any]]:
        """Rank ``documents`` by similarity between the question and each context embedding.

        Args:
            question: The user question.
            documents: Candidate documents; each is read through
                ``doc.get('content', '')`` and skipped when that is empty.
            top_k: Maximum number of results to return.

        Returns:
            Up to ``top_k`` dicts with keys ``'document'`` (the original dict),
            ``'context_score'`` (a plain ``float``) and ``'content'``, sorted
            by descending score.
        """
        logger.info(f"🔍 컨텍스트 임베딩 검색 시작: {len(documents)}개 문서")

        # Keep only the documents that actually carry content.
        candidates = []
        for doc in documents:
            doc_content = doc.get('content', '')
            if doc_content:
                candidates.append((doc, doc_content))

        scored_documents = []
        if candidates:
            question_embedding = self.model.encode(question)
            # Encode all contexts in a single call so the model can batch the
            # work instead of running one forward pass per document.
            contexts = [self._build_context(question, content) for _, content in candidates]
            context_embeddings = self.model.encode(contexts)
            similarities = cosine_similarity([question_embedding], context_embeddings)[0]
            scored_documents = [
                {
                    'document': doc,
                    # Plain float: numpy scalars are not JSON-serializable.
                    'context_score': float(similarity),
                    'content': content,
                }
                for (doc, content), similarity in zip(candidates, similarities)
            ]

        # Highest similarity first.
        scored_documents.sort(key=lambda x: x['context_score'], reverse=True)
        logger.info(f"📊 컨텍스트 임베딩 검색 완료: {len(scored_documents)}개 결과")
        return scored_documents[:top_k]
 class ContextBM25:
    """Keyword search over document contents backed by a ``BM25Okapi`` index."""

    def __init__(self):
        """Start with no index; ``build_index`` must be called before searching."""
        self.bm25 = None
        self.documents = []
        logger.info("✅ 컨텍스트 BM25 초기화 완료")

    def preprocess_text(self, text: str) -> List[str]:
        """Tokenize ``text`` on whitespace after blanking out punctuation.

        Args:
            text: Raw text; ``None`` or an empty string yields no tokens.

        Returns:
            The list of whitespace-separated tokens.
        """
        if not text:
            # Documents may carry ``content: None``; treat that as empty.
            return []
        # Replace everything that is not a word character or whitespace.
        cleaned = re.sub(r'[^\w\s가-힣]', ' ', text)
        # ``split()`` without arguments already drops empty tokens.
        return cleaned.split()

    def build_index(self, documents: List[Dict[str, Any]]):
        """Tokenize every document's ``'content'`` and build the BM25 index.

        When there are no documents, or none of them produces a token, no
        index is built (``self.bm25`` is reset to ``None``) because
        ``BM25Okapi`` divides by the corpus size and vocabulary size.
        """
        self.documents = documents
        corpus = [self.preprocess_text(doc.get('content', '')) for doc in documents]
        if not any(corpus):
            self.bm25 = None
            logger.warning("⚠️ BM25 인덱스를 구축할 문서가 없습니다.")
            return
        self.bm25 = BM25Okapi(corpus)
        logger.info(f"📚 BM25 인덱스 구축 완료: {len(corpus)}개 문서")

    def search_with_context(self, question: str, top_k: int = 10) -> List[Dict[str, Any]]:
        """Score every indexed document against ``question`` with BM25.

        Args:
            question: The user question; tokenized with ``preprocess_text``.
            top_k: Maximum number of results to return.

        Returns:
            Up to ``top_k`` dicts with keys ``'document'``, ``'bm25_score'``
            (a plain ``float``) and ``'content'``, sorted by descending score.
            An empty list when no index has been built.
        """
        if self.bm25 is None:
            logger.warning("⚠️ BM25 인덱스가 구축되지 않았습니다.")
            return []
        logger.info(f"🔍 컨텍스트 BM25 검색 시작: {question}")

        question_tokens = self.preprocess_text(question)
        scores = self.bm25.get_scores(question_tokens)

        scored_documents = [
            {
                'document': doc,
                # Plain float: numpy scalars are not JSON-serializable.
                'bm25_score': float(score),
                'content': doc.get('content', ''),
            }
            for doc, score in zip(self.documents, scores)
        ]

        # Highest score first.
        scored_documents.sort(key=lambda x: x['bm25_score'], reverse=True)
        logger.info(f"📊 컨텍스트 BM25 검색 완료: {len(scored_documents)}개 결과")
        return scored_documents[:top_k]
 class Reranker:
    """Re-rank search results by embedding similarity plus keyword overlap."""

    def __init__(self, model_name: str = "jhgan/ko-sroberta-multitask"):
        """Load the SentenceTransformer model identified by ``model_name``."""
        self.model = SentenceTransformer(model_name)
        logger.info(f"✅ Reranker 모델 로드 완료: {model_name}")

    def calculate_relevance_score(self, question: str, document_content: str, question_embedding: Any = None) -> float:
        """Return the relevance of ``document_content`` to ``question``.

        The score is ``0.7 * cosine similarity + 0.3 * keyword score``.

        Args:
            question: The user question.
            document_content: The document text to score.
            question_embedding: Optional pre-computed embedding of
                ``question``; when omitted it is computed here. Passing it
                avoids re-encoding the same question for every document.

        Returns:
            The combined score as a plain ``float``.
        """
        if question_embedding is None:
            question_embedding = self.model.encode(question)
        doc_embedding = self.model.encode(document_content)
        # Convert to a plain float so the declared return type holds.
        similarity = float(cosine_similarity([question_embedding], [doc_embedding])[0][0])
        keyword_score = self._calculate_keyword_score(question, document_content)
        return 0.7 * similarity + 0.3 * keyword_score

    def _calculate_keyword_score(self, question: str, document: str) -> float:
        """Return a term-frequency overlap score in ``[0.0, 1.0]``.

        For every token shared by both texts (case-insensitive), the product
        of its two frequencies is divided by the product of the two token
        counts; the sum is capped at 1.0. Returns 0.0 when nothing is shared.
        """
        question_tokens = re.findall(r'\b\w+\b', question.lower())
        doc_tokens = re.findall(r'\b\w+\b', document.lower())
        question_counter = Counter(question_tokens)
        doc_counter = Counter(doc_tokens)

        common_tokens = question_counter.keys() & doc_counter.keys()
        if not common_tokens:
            return 0.0

        # Non-zero here: both token lists contain at least one shared token.
        normalizer = len(question_tokens) * len(doc_tokens)
        total_score = sum(
            (question_counter[token] * doc_counter[token]) / normalizer
            for token in common_tokens
        )
        return min(total_score, 1.0)

    def rerank_documents(self, question: str, documents: List[Dict[str, Any]], top_k: int = 10) -> List[Dict[str, Any]]:
        """Re-score ``documents`` against ``question`` and return the best ``top_k``.

        Args:
            question: The user question.
            documents: Result dicts carrying ``'content'`` and optionally
                ``'context_score'`` / ``'bm25_score'``; entries with empty
                content are dropped.
            top_k: Maximum number of results to return.

        Returns:
            Copies of the input dicts extended with ``'rerank_score'`` and
            ``'final_score'`` (``0.6 * rerank + 0.4 * original``), sorted by
            descending ``'final_score'``.
        """
        logger.info(f"🔄 Reranker 재순위화 시작: {len(documents)}개 문서")

        candidates = [doc_info for doc_info in documents if doc_info.get('content', '')]
        reranked_documents = []
        if candidates:
            # The question is the same for every document: encode it once.
            question_embedding = self.model.encode(question)
            for doc_info in candidates:
                relevance_score = self.calculate_relevance_score(
                    question, doc_info.get('content', ''), question_embedding
                )
                # NOTE(review): context_score and bm25_score are summed
                # unnormalized; if the BM25 scores are on a larger scale than
                # the cosine scores they will dominate — confirm intended.
                original_score = doc_info.get('context_score', 0) + doc_info.get('bm25_score', 0)
                final_score = float(0.6 * relevance_score + 0.4 * original_score)
                reranked_documents.append({
                    **doc_info,
                    'rerank_score': relevance_score,
                    'final_score': final_score,
                })

        # Highest final score first.
        reranked_documents.sort(key=lambda x: x['final_score'], reverse=True)
        logger.info(f"📊 Reranker 재순위화 완료: {len(reranked_documents)}개 결과")
        return reranked_documents[:top_k]
 class ContextRetrieval:
    """Facade that chains context-embedding search, BM25 search and reranking."""

    def __init__(self, model_name: str = "jhgan/ko-sroberta-multitask"):
        """Create the three retrieval components.

        NOTE(review): ContextEmbedding and Reranker each construct their own
        SentenceTransformer from the same ``model_name``.
        """
        self.context_embedding = ContextEmbedding(model_name)
        self.context_bm25 = ContextBM25()
        self.reranker = Reranker(model_name)
        logger.info("✅ 컨텍스트 검색 시스템 초기화 완료")

    def build_index(self, documents: List[Dict[str, Any]]):
        """Build the BM25 index; the same documents are later reused by ``search``."""
        logger.info(f"📚 검색 인덱스 구축 시작: {len(documents)}개 문서")
        self.context_bm25.build_index(documents)
        logger.info("✅ 검색 인덱스 구축 완료")

    def search(self, question: str, top_k: int = 10) -> List[Dict[str, Any]]:
        """Run both searches, merge their hits and return the reranked ``top_k``.

        Each underlying search is asked for ``top_k * 2`` candidates.
        """
        logger.info(f"🔍 컨텍스트 기반 통합 검색 시작: {question}")
        candidate_limit = top_k * 2

        # Step 1: embedding search over the documents held by the BM25 index.
        indexed_documents = self.context_bm25.documents
        from_embedding = self.context_embedding.search_with_context(
            question, indexed_documents, candidate_limit
        )

        # Step 2: BM25 keyword search.
        from_bm25 = self.context_bm25.search_with_context(question, candidate_limit)

        # Step 3: merge both result lists, one entry per document.
        merged = self._combine_results(from_embedding, from_bm25)

        # Step 4: rerank the merged candidates.
        final_results = self.reranker.rerank_documents(question, merged, top_k)

        logger.info(f"📊 컨텍스트 기반 통합 검색 완료: {len(final_results)}개 결과")
        return final_results

    def _combine_results(self, embedding_results: List[Dict], bm25_results: List[Dict]) -> List[Dict]:
        """Merge embedding and BM25 hits, keeping one entry per document object.

        Documents are identified by ``id()`` of the ``'document'`` value. The
        first entry seen for a document is kept (and mutated in place); a
        later entry only raises that entry's score for its own score key.
        """
        merged = {}
        sources = (
            ('context_score', embedding_results),
            ('bm25_score', bm25_results),
        )
        for score_key, results in sources:
            for entry in results:
                identity = id(entry['document'])
                if identity in merged:
                    kept = merged[identity]
                    kept[score_key] = max(kept.get(score_key, 0), entry[score_key])
                else:
                    merged[identity] = entry
        return list(merged.values())
--- a/backend/services/langchain_service.py
+++ b/backend/services/langchain_service.py
@ -6,7 +6,6 @@ LangChain v0.3 기반 AI 서비스
 import os
 import logging
 from typing import List, Dict, Any, Optional
 from datetime import datetime
 from .context_retrieval import ContextRetrieval
 # LangChain Core
@ -16,8 +15,6 @@ from langchain_core.vectorstores import VectorStore
 from langchain_core.retrievers import BaseRetriever
 from langchain_core.language_models import BaseLanguageModel
 from langchain_core.prompts import PromptTemplate
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.runnables import RunnablePassthrough, RunnableParallel
 # LangChain Community
 from langchain_community.vectorstores import Chroma
@ -25,7 +22,6 @@ from langchain_community.embeddings import SentenceTransformerEmbeddings
 from langchain_community.llms import Ollama
 # LangChain Chains
 from langchain.chains import RetrievalQA
 from langchain.chains.combine_documents import create_stuff_documents_chain
 from langchain.chains import create_retrieval_chain
@ -696,29 +692,6 @@ class LangChainRAGService:
        return max(0.0, score)  # 음수 점수 방지
    def _verify_content_relevance(self, content: str, question: str) -> bool:
        """Check whether a document's content is actually related to the question (simplified).

        Returns False when either argument is empty or when the content
        contains one of the known boilerplate markers; otherwise returns True
        when at least one keyword extracted from the question occurs in the
        lower-cased content.
        """
        if not (content and question):
            return False

        lowered_content = content.lower()
        keywords = self._extract_keywords(question)

        # Markers of pages that are clearly unrelated (revision tables,
        # binder covers, storage lists and similar boilerplate).
        boilerplate_markers = [
            '개정번호', '개정항목', '개정사유', '발효일자', '신규제정',
            '바인더 표지', '보관번호', '보관목록', '관리부서',
            'prepared by', 'format no', 'storage number'
        ]
        for marker in boilerplate_markers:
            if marker in lowered_content:
                return False

        # Relaxed rule: a single matching keyword is enough (was 2, now 1).
        matched = [keyword for keyword in keywords if keyword in lowered_content]
        return len(matched) >= 1
    def _verify_answer_document_relevance(self, answer: str, doc_content: str, question: str) -> bool:
        """답변과 문서 내용의 직접적 연관성 검증"""
--- a/frontend/public/images/chatbot_icon.svg
+++ b/frontend/public/images/chatbot_icon.svg
@ -0,0 +1,10 @@
 <svg width="32" height="32" viewBox="0 0 32 32" fill="none" xmlns="http://www.w3.org/2000/svg">
  <!-- 노란색 원형 배경 -->
  <circle cx="16" cy="16" r="16" fill="#FFD700"/>
  <!-- 흰색 내부 원 (도넛 모양) -->
  <circle cx="16" cy="16" r="12" fill="white"/>
  <!-- 중앙에 W 문자 -->
  <text x="16" y="20" font-family="Arial, sans-serif" font-size="14" font-weight="bold" text-anchor="middle" fill="#FFD700">W</text>
 </svg>
--- a/frontend/src/components/SimpleMarkdownRenderer.tsx
+++ b/frontend/src/components/SimpleMarkdownRenderer.tsx
@ -0,0 +1,136 @@
 import React from 'react';
 import ReactMarkdown from 'react-markdown';
 import remarkGfm from 'remark-gfm';
 // Shape of one entry in the `detailedReferences` prop of SimpleMarkdownRenderer.
 // NOTE(review): presumably mirrors a reference object returned by the backend — confirm.
 interface DetailedReference {
  filename: string;
  file_id: string;
  page_number: number;
  chunk_index?: number;
  content_preview?: string;
  full_content?: string;
  is_relevant?: boolean;
 }
 // Props of SimpleMarkdownRenderer.
 // `content` is the markdown text handed to ReactMarkdown. `detailedReferences`
 // and `onReferenceClick` are accepted by the component but are not read in
 // its render output.
 interface SimpleMarkdownRendererProps {
  content: string;
  detailedReferences?: DetailedReference[];
  onReferenceClick?: (fileId: string, pageNumber: number, filename: string) => void;
 }
 // Renders `content` as GitHub-flavored markdown with Tailwind-styled elements.
 // `detailedReferences` and `onReferenceClick` are accepted for interface
 // compatibility but are not used by the render output.
 const SimpleMarkdownRenderer: React.FC<SimpleMarkdownRendererProps> = ({
  content,
  detailedReferences = [],
  onReferenceClick
 }) => {
  // Build the ReactMarkdown element for the given markdown text.
  const renderMarkdown = (text: string): React.ReactNode => {
    return (
      <ReactMarkdown
        remarkPlugins={[remarkGfm]}
        components={{
          // Links: the `components` map is keyed by HTML tag name, so the
          // anchor override must be registered as `a` (a `link` key would
          // target the <link> element and never match markdown links).
          a: ({ href, children }) => (
            <a href={href} target="_blank" rel="noopener noreferrer" className="text-blue-600 hover:text-blue-800 underline">
              {children}
            </a>
          ),
          // Headings
          h1: ({ children }) => (
            <h1 className="text-2xl font-bold text-gray-900 mb-4 mt-6 first:mt-0">
              {children}
            </h1>
          ),
          h2: ({ children }) => (
            <h2 className="text-xl font-semibold text-gray-800 mb-3 mt-5">
              {children}
            </h2>
          ),
          h3: ({ children }) => (
            <h3 className="text-lg font-medium text-gray-700 mb-2 mt-4">
              {children}
            </h3>
          ),
          // Paragraphs
          p: ({ children }) => (
            <p className="text-gray-700 leading-relaxed mb-4">
              {children}
            </p>
          ),
          // Lists
          ul: ({ children }) => (
            <ul className="list-disc pl-6 text-gray-700 mb-4 space-y-2">
              {children}
            </ul>
          ),
          ol: ({ children }) => (
            <ol className="list-decimal pl-6 text-gray-700 mb-4 space-y-2">
              {children}
            </ol>
          ),
          li: ({ children }) => (
            <li className="text-gray-700 leading-relaxed ml-2">
              {children}
            </li>
          ),
          // Emphasis
          strong: ({ children }) => (
            <strong className="font-semibold text-gray-900">
              {children}
            </strong>
          ),
          em: ({ children }) => (
            <em className="italic text-gray-800">
              {children}
            </em>
          ),
          // Code
          code: ({ children }) => (
            <code className="bg-gray-100 text-gray-800 px-1 py-0.5 rounded text-sm font-mono">
              {children}
            </code>
          ),
          pre: ({ children }) => (
            <pre className="bg-gray-100 text-gray-800 p-4 rounded-lg overflow-x-auto mb-4">
              {children}
            </pre>
          ),
          // Block quotes
          blockquote: ({ children }) => (
            <blockquote className="border-l-4 border-blue-500 pl-6 py-2 italic text-gray-600 mb-4 bg-blue-50 rounded-r">
              {children}
            </blockquote>
          ),
          // Tables (wrapped so wide tables scroll horizontally)
          table: ({ children }) => (
            <div className="overflow-x-auto mb-4">
              <table className="min-w-full border-collapse border border-gray-300">
                {children}
              </table>
            </div>
          ),
          th: ({ children }) => (
            <th className="border border-gray-300 bg-gray-100 px-4 py-2 text-left font-semibold text-gray-900">
              {children}
            </th>
          ),
          td: ({ children }) => (
            <td className="border border-gray-300 px-4 py-2 text-gray-700">
              {children}
            </td>
          ),
        }}
      >
        {text}
      </ReactMarkdown>
    );
  };
  return (
    <div className="prose prose-gray max-w-none">
      {renderMarkdown(content)}
    </div>
  );
 };
 export default SimpleMarkdownRenderer;