Skip to content

Latest commit

ย 

History

History
397 lines (301 loc) ยท 11.8 KB

File metadata and controls

397 lines (301 loc) ยท 11.8 KB

05. 고급 — 수동 조합, 커스텀 어댑터, 관측성

๊ฐ ์ปดํฌ๋„ŒํŠธ๋ฅผ ๊ฐœ๋ณ„ ์ œ์–ดํ•˜๊ณ , ์ž„๋ฒ ๋”ฉยท๋ฒกํ„ฐ์Šคํ† ์–ดยท์ฒญ์ปค๋ฅผ ๊ต์ฒดํ•˜๋Š” ๋ฐฉ๋ฒ•์„ ๋‹ค๋ฃน๋‹ˆ๋‹ค.


1) 완전 수동 Advanced Flow

Retriever โ†’ Generator โ†’ Executor๋ฅผ ์ง์ ‘ ์กฐํ•ฉํ•ฉ๋‹ˆ๋‹ค. ๊ฐ ๋‹จ๊ณ„์˜ ์ž…์ถœ๋ ฅ์ด ์ฝ”๋“œ์— ๋ณด์—ฌ ๋””๋ฒ„๊น…๊ณผ ํŒŒ๋ผ๋ฏธํ„ฐ ํŠœ๋‹์ด ์‰ฝ์Šต๋‹ˆ๋‹ค.

from lang2sql import (
    CatalogChunker,
    DirectoryLoader,
    RecursiveCharacterChunker,
    SQLExecutor,
    SQLGenerator,
    VectorRetriever,
)
from lang2sql.integrations.db import SQLAlchemyDB
from lang2sql.integrations.embedding import OpenAIEmbedding
from lang2sql.integrations.llm import OpenAILLM

# Hand-written table catalog: one entry per table, giving a short table
# description plus a column-name -> business-term mapping. These texts are
# what the retriever indexes and the LLM sees as schema context.
catalog = [
    {
        "name": "orders",
        "description": "์ฃผ๋ฌธ ์ •๋ณด",
        "columns": {
            "order_id":   "์ฃผ๋ฌธ ๊ณ ์œ  ID",
            "order_date": "์ฃผ๋ฌธ ์ผ์‹œ",
            "amount":     "๊ฒฐ์ œ ๊ธˆ์•ก",
            "status":     "์ฃผ๋ฌธ ์ƒํƒœ",
        },
    }
]

# 1) Load documents + split them into chunks
docs            = DirectoryLoader("docs/business").load()
embedding       = OpenAIEmbedding(model="text-embedding-3-small")
catalog_chunks  = CatalogChunker().split(catalog)
doc_chunks      = RecursiveCharacterChunker(chunk_size=800, chunk_overlap=80).split(docs)

# 2) Build the Retriever over catalog + document chunks
retriever = VectorRetriever.from_chunks(
    catalog_chunks + doc_chunks,
    embedding=embedding,
    top_n=5,
    score_threshold=0.2,
)

# 3) Configure Generator / Executor individually
generator = SQLGenerator(
    llm=OpenAILLM(model="gpt-4o-mini"),
    db_dialect="sqlite",
)
executor = SQLExecutor(db=SQLAlchemyDB("sqlite:///sample.db"))

# 4) Manual run — each stage's output is a plain value you can inspect
query     = "์ง€๋‚œ๋‹ฌ ์ˆœ๋งค์ถœ ํ•ฉ๊ณ„"
retrieval = retriever.run(query)
sql       = generator.run(query, retrieval.schemas, context=retrieval.context)
rows      = executor.run(sql)

print("SQL:", sql)
print("๊ฒฐ๊ณผ:", rows)

2) 임베딩 교체

EmbeddingPort๋ฅผ ๋งŒ์กฑํ•˜๋Š” ๊ตฌํ˜„์ฒด๋ผ๋ฉด ๋ฌด์—‡์ด๋“  ์—ฐ๊ฒฐํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.

v2 ๋‚ด์žฅ ์ž„๋ฒ ๋”ฉ

from lang2sql.integrations.embedding import (
    OpenAIEmbedding,
    AzureOpenAIEmbedding,
    GeminiEmbedding,
    BedrockEmbedding,
    OllamaEmbedding,
    HuggingFaceEmbedding,
)

embedding = OpenAIEmbedding(model="text-embedding-3-small")  # any built-in embedding works here

์ปค์Šคํ…€ ์–ด๋Œ‘ํ„ฐ ์˜ˆ์‹œ (SentenceTransformer)

class SentenceTransformerEmbedding:
    """EmbeddingPort adapter backed by a local SentenceTransformer model.

    Satisfies the port structurally (embed_query / embed_texts) without
    inheriting from any lang2sql base class.
    """

    def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
        # Lazy import: the dependency is only required when this adapter is used.
        from sentence_transformers import SentenceTransformer
        self._model = SentenceTransformer(model_name)

    def embed_query(self, text: str) -> list[float]:
        """Embed a single text; returns one normalized vector."""
        batch = self._model.encode([text], normalize_embeddings=True)
        return batch[0].tolist()

    def embed_texts(self, texts: list[str]) -> list[list[float]]:
        """Embed a batch of texts; returns one normalized vector per input."""
        encoded = self._model.encode(texts, normalize_embeddings=True)
        return encoded.tolist()

# ํŒŒ์ดํ”„๋ผ์ธ ์ฝ”๋“œ๋Š” ๋™์ผ
retriever = VectorRetriever.from_sources(
    catalog=catalog,
    embedding=SentenceTransformerEmbedding(),
)

3) 벡터스토어 교체

VectorStorePort์˜ upsert()์™€ search()๋งŒ ๊ตฌํ˜„ํ•˜๋ฉด ๋ฉ๋‹ˆ๋‹ค.

์ปค์Šคํ…€ ์–ด๋Œ‘ํ„ฐ ์˜ˆ์‹œ (Chroma)

class ChromaVectorStore:
    """VectorStorePort adapter over a Chroma collection.

    Implements exactly the two methods the port requires: upsert() and search().
    """

    def __init__(self, collection_name: str = "lang2sql"):
        # Lazy import: chromadb is only required when this adapter is used.
        import chromadb
        self._client = chromadb.Client()
        self._col = self._client.get_or_create_collection(collection_name)

    def upsert(self, ids: list[str], vectors: list[list[float]]) -> None:
        """Store (or overwrite) the given vectors under their ids."""
        self._col.upsert(ids=ids, embeddings=vectors)

    def search(self, vector: list[float], k: int) -> list[tuple[str, float]]:
        """Return the k nearest (id, score) pairs, with score = 1 - distance."""
        results = self._col.query(query_embeddings=[vector], n_results=k)
        pairs = zip(results["ids"][0], results["distances"][0])
        # Chroma reports distances; flip them into similarity-style scores.
        return [(chunk_id, 1.0 - distance) for chunk_id, distance in pairs]

# Plug the custom vectorstore into the retriever; embedding stays OpenAI.
retriever = VectorRetriever.from_sources(
    catalog=catalog,
    embedding=OpenAIEmbedding(),
    vectorstore=ChromaVectorStore("my_catalog"),
)

4) 청커 교체

SemanticChunker (opt-in)

from lang2sql import CatalogChunker, VectorRetriever
from lang2sql.integrations.chunking import SemanticChunker
from lang2sql.integrations.embedding import OpenAIEmbedding

embedding = OpenAIEmbedding(model="text-embedding-3-small")

# from_chunks pattern: split up-front, then hand the chunks to the retriever
doc_chunks = SemanticChunker(
    embedding=embedding,
    breakpoint_threshold=0.3,
    min_chunk_size=100,
).split(docs)

retriever = VectorRetriever.from_chunks(
    CatalogChunker().split(catalog) + doc_chunks,
    embedding=embedding,
)

# from_sources pattern: pass the chunker via the splitter parameter
retriever = VectorRetriever.from_sources(
    catalog=catalog,
    documents=docs,
    embedding=embedding,
    splitter=SemanticChunker(embedding=embedding),
)

LangChain ์ฒญ์ปค ์–ด๋Œ‘ํ„ฐ

from langchain_text_splitters import RecursiveCharacterTextSplitter
from lang2sql import IndexedChunk, TextDocument

class LangChainChunkerAdapter:
    """DocumentChunkerPort adapter that delegates splitting to a LangChain text splitter."""

    def __init__(self, splitter):
        # Any object exposing split_text(str) -> list[str] will do.
        self._splitter = splitter

    def chunk(self, doc: TextDocument) -> list[IndexedChunk]:
        """Split one document into IndexedChunk records, prefixing each chunk with the title."""
        pieces = self._splitter.split_text(doc["content"])
        title = doc.get("title", "")
        chunks = []
        for index, piece in enumerate(pieces):
            body = f"{title}: {piece}" if title else piece
            chunks.append(
                IndexedChunk(
                    chunk_id=f"{doc['id']}__{index}",
                    text=body,
                    source_type="document",
                    source_id=doc["id"],
                    chunk_index=index,
                    metadata={"title": title, "source": doc.get("source", "")},
                )
            )
        return chunks

# Wrap a LangChain splitter so it satisfies DocumentChunkerPort.
lc_chunker = LangChainChunkerAdapter(
    RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
)

retriever = VectorRetriever.from_sources(
    catalog=catalog,
    documents=docs,
    embedding=OpenAIEmbedding(),
    splitter=lc_chunker,
)

5) DataHub 카탈로그 로더

DataHub GMS ์„œ๋ฒ„์—์„œ ํ…Œ์ด๋ธ” ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ๋ฅผ ๊ฐ€์ ธ์™€ CatalogEntry ๋ชฉ๋ก์œผ๋กœ ๋ณ€ํ™˜ํ•ฉ๋‹ˆ๋‹ค. ์ˆ˜๋™์œผ๋กœ ์นดํƒˆ๋กœ๊ทธ๋ฅผ ์ž‘์„ฑํ•˜์ง€ ์•Š์•„๋„ DataHub์— ๋“ฑ๋ก๋œ ์Šคํ‚ค๋งˆ ์ •๋ณด๋ฅผ ๋ฐ”๋กœ ์‚ฌ์šฉํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.

pip install acryl-datahub
from lang2sql.integrations.catalog import DataHubCatalogLoader

# Point the loader at a DataHub GMS server; headers carry auth if needed.
loader = DataHubCatalogLoader(
    gms_server="http://localhost:8080",
    extra_headers={"Authorization": "Bearer <token>"},
)

# Fetch every registered dataset URN
catalog = loader.load()

# Fetch only specific URNs
catalog = loader.load(urns=[
    "urn:li:dataset:(urn:li:dataPlatform:postgres,mydb.public.orders,PROD)",
    "urn:li:dataset:(urn:li:dataPlatform:postgres,mydb.public.customers,PROD)",
])

# Wire the loaded catalog directly into a pipeline
from lang2sql import BaselineNL2SQL
from lang2sql.integrations.db import SQLAlchemyDB
from lang2sql.integrations.llm import OpenAILLM

pipeline = BaselineNL2SQL(
    catalog=catalog,
    llm=OpenAILLM(model="gpt-4o-mini"),
    db=SQLAlchemyDB("postgresql://user:pass@localhost:5432/mydb"),
    db_dialect="postgresql",
)

DataHubCatalogLoader๋Š” CatalogLoaderPort๋ฅผ ๊ตฌํ˜„ํ•ฉ๋‹ˆ๋‹ค. DataHub ์—†์ด๋„ SQLAlchemyExplorer๋กœ DDL์„ ์ง์ ‘ ์กฐํšŒํ•˜๊ฑฐ๋‚˜ CSV/์ˆ˜๋™ ์นดํƒˆ๋กœ๊ทธ๋ฅผ ์‚ฌ์šฉํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.


6) Port 프로토콜 레퍼런스

์ปค์Šคํ…€ ์–ด๋Œ‘ํ„ฐ๋ฅผ ์ž‘์„ฑํ•  ๋•Œ ๊ตฌํ˜„ํ•ด์•ผ ํ•˜๋Š” ๋ฉ”์„œ๋“œ ๋ชฉ๋ก์ž…๋‹ˆ๋‹ค.

Port ๋ฉ”์„œ๋“œ ์‹œ๊ทธ๋‹ˆ์ฒ˜ ์šฉ๋„
LLMPort invoke (messages: list[dict]) -> str LLM ๋ฐฑ์—”๋“œ ๊ต์ฒด
DBPort execute (sql: str) -> list[dict] DB ๋ฐฑ์—”๋“œ ๊ต์ฒด
EmbeddingPort embed_query (text: str) -> list[float] ๋‹จ์ผ ํ…์ŠคํŠธ ์ž„๋ฒ ๋”ฉ
embed_texts (texts: list[str]) -> list[list[float]] ๋ฐฐ์น˜ ์ž„๋ฒ ๋”ฉ
VectorStorePort upsert (ids: list[str], vectors: list[list[float]]) -> None ๋ฒกํ„ฐ ์ €์žฅ
search (vector: list[float], k: int) -> list[tuple[str, float]] ์œ ์‚ฌ๋„ ๊ฒ€์ƒ‰ (id, score)
DocumentLoaderPort load () -> list[TextDocument] ๋ฌธ์„œ ๋กœ๋“œ
DocumentChunkerPort chunk (doc: TextDocument) -> list[IndexedChunk] ๋ฌธ์„œ ๋ถ„ํ• 
CatalogLoaderPort load (urns: list[str] | None) -> list[CatalogEntry] ์™ธ๋ถ€ ์นดํƒˆ๋กœ๊ทธ ๋กœ๋“œ
DBExplorerPort list_tables () -> list[str] ํ…Œ์ด๋ธ” ๋ชฉ๋ก
get_ddl (table: str) -> str DDL ์กฐํšŒ
sample_data (table: str, limit: int) -> list[dict] ์ƒ˜ํ”Œ ๋ฐ์ดํ„ฐ
execute_read_only (sql: str) -> list[dict] ์ฝ๊ธฐ ์ „์šฉ ์ฟผ๋ฆฌ

๋ชจ๋“  Port๋Š” src/lang2sql/core/ports.py์— Protocol๋กœ ์ •์˜๋˜์–ด ์žˆ์Šต๋‹ˆ๋‹ค. ํด๋ž˜์Šค ์ƒ์† ์—†์ด ๋ฉ”์„œ๋“œ ์‹œ๊ทธ๋‹ˆ์ฒ˜๋งŒ ๋งž์ถ”๋ฉด ์–ด๋–ค ๊ฐ์ฒด๋“  ์—ฐ๊ฒฐํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค (structural subtyping).


7) Hook — 관측성과 디버깅

MemoryHook์œผ๋กœ ์ปดํฌ๋„ŒํŠธ ๋‹จ์œ„ ์‹คํ–‰ ์ด๋ฒคํŠธ๋ฅผ ์ˆ˜์ง‘ํ•ฉ๋‹ˆ๋‹ค.

from lang2sql import HybridNL2SQL, MemoryHook
from lang2sql.integrations.db import SQLAlchemyDB
from lang2sql.integrations.embedding import OpenAIEmbedding
from lang2sql.integrations.llm import OpenAILLM

# Collect per-component execution events in memory.
hook = MemoryHook()

pipeline = HybridNL2SQL(
    catalog=catalog,
    llm=OpenAILLM(model="gpt-4o-mini"),
    db=SQLAlchemyDB("sqlite:///sample.db"),
    embedding=OpenAIEmbedding(model="text-embedding-3-small"),
    documents=docs,
    db_dialect="sqlite",
    hook=hook,
)

pipeline.run("์ง€๋‚œ๋‹ฌ ์ˆœ๋งค์ถœ ํ•ฉ๊ณ„")

# One line per event: component name, phase (start/end), duration, error.
for e in hook.snapshot():
    print(f"{e.component:30s}  {e.phase:5s}  {e.duration_ms:6.1f}ms  error={e.error}")

์ถœ๋ ฅ ์˜ˆ์‹œ:

HybridRetriever                start   0.0ms  error=None
HybridRetriever                end    12.3ms  error=None
SQLGenerator                   start   0.0ms  error=None
SQLGenerator                   end   890.1ms  error=None
SQLExecutor                    start   0.0ms  error=None
SQLExecutor                    end     1.2ms  error=None

์šด์˜ ํ™˜๊ฒฝ์—์„œ๋Š” duration_ms๋กœ ๋ณ‘๋ชฉ์„ ํŒŒ์•…ํ•˜๊ณ  error ์ด๋ฒคํŠธ๋ฅผ ์ˆ˜์ง‘ํ•ด ์žฅ์•  ํŒจํ„ด์„ ๋ถ„์„ํ•ฉ๋‹ˆ๋‹ค.


8) Best Practices 체크리스트

์นดํƒˆ๋กœ๊ทธ ์ž‘์„ฑ

  • description์€ ํ•œ ๋ฌธ์žฅ์œผ๋กœ ํ…Œ์ด๋ธ” ์šฉ๋„๋ฅผ ๋ช…ํ™•ํžˆ ๊ธฐ์ˆ 
  • columns๋Š” ๋น„์ฆˆ๋‹ˆ์Šค ์šฉ์–ด์™€ ์ปฌ๋Ÿผ๋ช… ๋งคํ•‘์„ ์ถฉ์‹คํžˆ ์ž‘์„ฑ
  • ๊ด€๋ จ ํ…Œ์ด๋ธ” ๊ฐ„ FK ๊ด€๊ณ„๋ฅผ ์ปฌ๋Ÿผ ์„ค๋ช…์— ๋ช…์‹œ

๊ฒ€์ƒ‰ ํŒŒ๋ผ๋ฏธํ„ฐ

  • top_n: 3~8๋กœ ์‹œ์ž‘ํ•ด ์‹คํ—˜ (๋„ˆ๋ฌด ๋งŽ์œผ๋ฉด LLM ํ”„๋กฌํ”„ํŠธ ๋น„์šฉ ์ฆ๊ฐ€)
  • score_threshold: 0.0์œผ๋กœ ์‹œ์ž‘ ํ›„ ๊ด€๋ จ ์—†๋Š” ํ…Œ์ด๋ธ”์ด ๊ฒ€์ƒ‰๋  ๋•Œ 0.3~0.5๋กœ ์ƒํ–ฅ
  • rrf_k (HybridRetriever): ๊ธฐ๋ณธ๊ฐ’ 60, ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ ์ˆœ์œ„ ๋ฏผ๊ฐ๋„ ์กฐ์ •

์ฒญํ‚น

  • ๊ธฐ๋ณธ์€ RecursiveCharacterChunker
  • ๋ฌธ์„œ ํ’ˆ์งˆ์ด ์ค‘์š”ํ•˜๊ณ  ๋น„์šฉ ํ—ˆ์šฉ ์‹œ SemanticChunker ๊ฒ€ํ† 
  • chunk_overlap์€ ๋ฐ˜๋“œ์‹œ chunk_size๋ณด๋‹ค ์ž‘๊ฒŒ ์„ค์ •

ํ”Œ๋กœ์šฐ ์„ ํƒ

์šฐ์„  ์ˆœ์œ„ ํŒŒ์ดํ”„๋ผ์ธ
๋น ๋ฅธ ์‹œ์ž‘ BaselineNL2SQL
๊ฒ€์ƒ‰ ํ’ˆ์งˆ HybridNL2SQL
์šด์˜ ํ™˜๊ฒฝ EnrichedNL2SQL
์„ธ๋ฐ€ํ•œ ์ œ์–ด ์ˆ˜๋™ ์ปดํฌ๋„ŒํŠธ ์กฐํ•ฉ

9) 트러블슈팅

IntegrationMissingError: openai

pip install openai

chunk_overlap must be less than chunk_size

RecursiveCharacterChunker์˜ chunk_overlap < chunk_size ์กฐ๊ฑด ์œ„๋ฐ˜. ํŒŒ๋ผ๋ฏธํ„ฐ๋ฅผ ์ˆ˜์ •ํ•˜์„ธ์š”.

VectorRetriever ๊ฒฐ๊ณผ๊ฐ€ ๋น„์–ด ์žˆ์Œ

  1. from_chunks() ๋˜๋Š” from_sources()๊ฐ€ ์‹ค์ œ๋กœ ํ˜ธ์ถœ๋๋Š”์ง€ ํ™•์ธ
  2. len(retriever._registry) > 0 ํ™•์ธ
  3. score_threshold๋ฅผ 0.0์œผ๋กœ ๋‚ฎ์ถฐ์„œ ํ…Œ์ŠคํŠธ

retriever.add() ํƒ€์ž… ์—๋Ÿฌ

add()๋Š” list[IndexedChunk]๋งŒ ๋ฐ›์Šต๋‹ˆ๋‹ค. TextDocument๋ฅผ ์ง์ ‘ ์ „๋‹ฌํ•˜๋ฉด ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ•ฉ๋‹ˆ๋‹ค.

# โŒ ๋™์ž‘ ์•ˆ ํ•จ
retriever.add(docs)

# โœ… ์˜ฌ๋ฐ”๋ฅธ ๋ฐฉ๋ฒ•
retriever.add(RecursiveCharacterChunker().split(docs))

IntegrationMissingError: pymupdf (PDFLoader)

pip install pymupdf

ContractError (EnrichedNL2SQL)

QuestionGate๊ฐ€ SQL๋กœ ๋‹ตํ•  ์ˆ˜ ์—†๋‹ค๊ณ  ํŒ๋‹จํ•œ ๊ฒฝ์šฐ์ž…๋‹ˆ๋‹ค. gate_enabled=False๋กœ ๋น„ํ™œ์„ฑํ™”ํ•˜๊ฑฐ๋‚˜ ์งˆ๋ฌธ์„ SQL ๊ด€๋ จ์œผ๋กœ ๊ตฌ์ฒดํ™”ํ•˜์„ธ์š”.