Coverage for mindsdb / integrations / utilities / rag / utils.py: 23%
20 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-21 00:36 +0000
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-21 00:36 +0000
1from typing import List
3import pandas as pd
4from langchain_core.documents import Document
5from langchain_core.embeddings import Embeddings
def df_to_documents(df: pd.DataFrame, content_column_name: str) -> List[Document]:
    """
    Build one Document per dataframe row.

    The value found under ``content_column_name`` becomes the document's page
    content; every other column of that row is kept as the document's metadata.

    :param df: pd.DataFrame
    :param content_column_name: str

    :raises KeyError: if ``content_column_name`` is not a column of a non-empty ``df``

    :return: List[Document]
    """
    def _record_to_document(record: dict) -> Document:
        # pop() both extracts the content and removes it from the dict,
        # so whatever remains in `record` is exactly the metadata.
        text = record.pop(content_column_name)
        return Document(page_content=text, metadata=record)

    return [_record_to_document(series.to_dict()) for _, series in df.iterrows()]
def documents_to_df(content_column_name: str,
                    documents: List[Document],
                    embedding_model: Embeddings = None,
                    with_embeddings: bool = False) -> pd.DataFrame:
    """
    Given a list of documents, convert it to a dataframe.

    Each document's metadata keys become columns and its page content is stored
    under ``content_column_name``, which is moved to the first position. If a
    ``date`` column is present it is parsed with ``pd.to_datetime`` and values
    that cannot be parsed become ``NaT``.

    :param content_column_name: str, name of the column that receives the page content
    :param documents: List[Document]
    :param embedding_model: Embeddings, required when ``with_embeddings`` is True
    :param with_embeddings: bool, when True an "embeddings" column is added,
        computed by ``embedding_model.embed_documents`` over the content column

    :raises ValueError: if ``with_embeddings`` is True and ``embedding_model`` is None

    :return: pd.DataFrame
    """
    # Fail fast with a clear message instead of an AttributeError on None
    # after the dataframe has already been built.
    if with_embeddings and embedding_model is None:
        raise ValueError("embedding_model must be provided when with_embeddings is True")

    df = pd.DataFrame([doc.metadata for doc in documents])

    # Assigned after construction, so a metadata key with the same name is
    # overwritten by the page content.
    df[content_column_name] = [doc.page_content for doc in documents]

    if 'date' in df.columns:
        # errors='coerce' turns unparseable values into NaT rather than raising.
        df['date'] = pd.to_datetime(df['date'], errors='coerce')

    # Reordering the columns to have the content column first.
    df = df[[content_column_name] + [col for col in df.columns if col != content_column_name]]

    if with_embeddings:
        df["embeddings"] = embedding_model.embed_documents(df[content_column_name].tolist())

    return df