Coverage for mindsdb / integrations / utilities / rag / utils.py: 23%

20 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-21 00:36 +0000

1from typing import List 

2 

3import pandas as pd 

4from langchain_core.documents import Document 

5from langchain_core.embeddings import Embeddings 

6 

7 

def df_to_documents(df: pd.DataFrame, content_column_name: str) -> List[Document]:
    """
    Build one Document per dataframe row.

    For every row, the value stored under ``content_column_name`` becomes the
    document's ``page_content``; all remaining columns of that row are passed
    along as the document's ``metadata``.

    :param df: dataframe whose rows are converted
    :param content_column_name: name of the column holding the document text

    :return: list of documents, in row order (empty when ``df`` has no rows)
    :raises KeyError: if a row has no ``content_column_name`` entry
    """
    def _row_to_document(row: pd.Series) -> Document:
        # Start from the full row; whatever is left after removing the
        # content value is the metadata.
        fields = row.to_dict()
        text = fields.pop(content_column_name)
        return Document(page_content=text, metadata=fields)

    return [_row_to_document(row) for _, row in df.iterrows()]

23 

24 

def documents_to_df(content_column_name: str,
                    documents: List[Document],
                    embedding_model: Embeddings = None,
                    with_embeddings: bool = False) -> pd.DataFrame:
    """
    Given a list of documents, convert it to a dataframe.

    Each document's ``metadata`` dict becomes one row; its ``page_content`` is
    stored under ``content_column_name``, which is placed as the first column.
    A ``date`` column, when present, is parsed with ``pd.to_datetime`` using
    ``errors='coerce'``.

    :param content_column_name: name of the column that receives ``page_content``
    :param documents: documents to convert
    :param embedding_model: model used to embed the content; may be ``None``
        only when ``with_embeddings`` is False
    :param with_embeddings: when True, add an ``embeddings`` column holding the
        result of ``embedding_model.embed_documents`` over the content column

    :return: dataframe with the content column first, followed by the metadata
        columns and, optionally, ``embeddings``
    :raises ValueError: if ``with_embeddings`` is True but no ``embedding_model``
        was supplied
    """
    # Fail fast with a clear message instead of an AttributeError on None
    # after the dataframe has already been built.
    if with_embeddings and embedding_model is None:
        raise ValueError(
            "embedding_model must be provided when with_embeddings is True"
        )

    df = pd.DataFrame([doc.metadata for doc in documents])

    df[content_column_name] = [doc.page_content for doc in documents]

    if 'date' in df.columns:
        df['date'] = pd.to_datetime(df['date'], errors='coerce')

    # Reordering the columns to have the content column first.
    other_columns = [col for col in df.columns if col != content_column_name]
    df = df[[content_column_name] + other_columns]

    if with_embeddings:
        df["embeddings"] = embedding_model.embed_documents(df[content_column_name].tolist())

    return df