Farid Karimli committed
Commit
6581a76
1 Parent(s): 351c4c7

Chunking strategy start

code/modules/dataloader/data_loader.py CHANGED
@@ -14,6 +14,8 @@ from llama_parse import LlamaParse
 from langchain.schema import Document
 import logging
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_experimental.text_splitter import SemanticChunker
+from langchain_openai.embeddings import OpenAIEmbeddings
 from ragatouille import RAGPretrainedModel
 from langchain.chains import LLMChain
 from langchain_community.llms import OpenAI
@@ -67,8 +69,7 @@ class HTMLReader:
 
             resp = requests.head(absolute_url)
             if resp.status_code != 200:
-                logger.warning(f"Link {absolute_url} is broken")
-                logger.warning(f"Status code: {resp.status_code}")
+                logger.warning(f"Link {absolute_url} is broken. Status code: {resp.status_code}")
 
         return str(soup)
 
@@ -154,21 +155,31 @@ class ChunkProcessor:
         self.document_metadata = {}
         self.document_chunks_full = []
 
+        if not config['vectorstore']['embedd_files']:
+            self.load_document_data()
+
         if config["splitter_options"]["use_splitter"]:
-            if config["splitter_options"]["split_by_token"]:
-                self.splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
-                    chunk_size=config["splitter_options"]["chunk_size"],
-                    chunk_overlap=config["splitter_options"]["chunk_overlap"],
-                    separators=config["splitter_options"]["chunk_separators"],
-                    disallowed_special=(),
-                )
+            if config["splitter_options"]["chunking_mode"] == "fixed":
+                if config["splitter_options"]["split_by_token"]:
+                    self.splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+                        chunk_size=config["splitter_options"]["chunk_size"],
+                        chunk_overlap=config["splitter_options"]["chunk_overlap"],
+                        separators=config["splitter_options"]["chunk_separators"],
+                        disallowed_special=(),
+                    )
+                else:
+                    self.splitter = RecursiveCharacterTextSplitter(
+                        chunk_size=config["splitter_options"]["chunk_size"],
+                        chunk_overlap=config["splitter_options"]["chunk_overlap"],
+                        separators=config["splitter_options"]["chunk_separators"],
+                        disallowed_special=(),
+                    )
             else:
-                self.splitter = RecursiveCharacterTextSplitter(
-                    chunk_size=config["splitter_options"]["chunk_size"],
-                    chunk_overlap=config["splitter_options"]["chunk_overlap"],
-                    separators=config["splitter_options"]["chunk_separators"],
-                    disallowed_special=(),
+                self.splitter = SemanticChunker(
+                    OpenAIEmbeddings(),
+                    breakpoint_threshold_type="percentile"
                 )
+
         else:
             self.splitter = None
         self.logger.info("ChunkProcessor instance created")
@@ -191,16 +202,11 @@ class ChunkProcessor:
     def process_chunks(
         self, documents, file_type="txt", source="", page=0, metadata={}
     ):
-        documents = [Document(page_content=documents, source=source, page=page)]
-        if (
-            file_type == "txt"
-            or file_type == "docx"
-            or file_type == "srt"
-            or file_type == "tex"
-        ):
+        if file_type == "pdf":
+            document_chunks = documents
+        else:
+            documents = [Document(page_content=documents, source=source, page=page)]
             document_chunks = self.splitter.split_documents(documents)
-        elif file_type == "pdf":
-            document_chunks = documents  # Full page for now
 
         # add the source and page number back to the metadata
         for chunk in document_chunks:
@@ -294,9 +300,6 @@ class ChunkProcessor:
     def process_file(self, file_path, file_index, file_reader, addl_metadata):
         file_name = os.path.basename(file_path)
 
-        if file_name in self.document_data:
-            return
-
         file_type = file_name.split(".")[-1]
 
         read_methods = {
@@ -311,7 +314,11 @@ class ChunkProcessor:
             return
 
         try:
-            documents = read_methods[file_type](file_path)
+            if file_path in self.document_data:
+                self.logger.warning(f"File {file_name} already processed")
+                documents = [Document(page_content=content) for content in self.document_data[file_path].values()]
+            else:
+                documents = read_methods[file_type](file_path)
 
             self.process_documents(
                 documents, file_path, file_type, "file", addl_metadata
@@ -370,6 +377,9 @@ class ChunkProcessor:
                 f"{self.config['log_chunk_dir']}/metadata/doc_metadata.json", "r"
             ) as json_file:
                 self.document_metadata = json.load(json_file)
+            self.logger.info(
+                f"Loaded document content from {self.config['log_chunk_dir']}/docs/doc_content.json. Total documents: {len(self.document_data)}"
+            )
 
 
 class DataLoader:
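The data_loader.py changes introduce a "chunking_mode" switch inside "splitter_options": "fixed" keeps the existing RecursiveCharacterTextSplitter behaviour, while any other value selects LangChain's experimental SemanticChunker driven by OpenAI embeddings. Below is a minimal standalone sketch of that switch, not the repo's ChunkProcessor: the option values are made-up placeholders (only the key names come from the diff), only the token-based "fixed" branch is shown for brevity, and SemanticChunker assumes OPENAI_API_KEY is set.

# Standalone sketch of the new splitter selection; values are illustrative only.
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

splitter_options = {  # hypothetical example values; key names match the diff
    "use_splitter": True,
    "chunking_mode": "semantic",  # "fixed" -> size-based chunks, otherwise semantic
    "split_by_token": True,
    "chunk_size": 1000,
    "chunk_overlap": 100,
    "chunk_separators": ["\n\n", "\n", " ", ""],
}

if not splitter_options["use_splitter"]:
    splitter = None
elif splitter_options["chunking_mode"] == "fixed":
    # Fixed-size chunking, with lengths measured in tiktoken tokens
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=splitter_options["chunk_size"],
        chunk_overlap=splitter_options["chunk_overlap"],
        separators=splitter_options["chunk_separators"],
        disallowed_special=(),
    )
else:
    # Semantic chunking: split where the embedding distance between adjacent
    # sentences exceeds the chosen percentile threshold (needs OPENAI_API_KEY)
    splitter = SemanticChunker(OpenAIEmbeddings(), breakpoint_threshold_type="percentile")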
code/modules/vectorstore/faiss.py CHANGED
@@ -7,6 +7,10 @@ class FaissVectorStore(VectorStoreBase):
     def __init__(self, config):
         self.config = config
         self._init_vector_db()
+        self.local_path = os.path.join(self.config["vectorstore"]["db_path"],
+                                       "db_" + self.config["vectorstore"]["db_option"]
+                                       + "_" + self.config["vectorstore"]["model"]
+                                       + "_" + config["splitter_options"]["chunking_mode"])
 
     def _init_vector_db(self):
         self.faiss = FAISS(
@@ -18,24 +22,12 @@ class FaissVectorStore(VectorStoreBase):
             documents=document_chunks, embedding=embedding_model
         )
         self.vectorstore.save_local(
-            os.path.join(
-                self.config["vectorstore"]["db_path"],
-                "db_"
-                + self.config["vectorstore"]["db_option"]
-                + "_"
-                + self.config["vectorstore"]["model"],
-            )
+            self.local_path
         )
 
     def load_database(self, embedding_model):
         self.vectorstore = self.faiss.load_local(
-            os.path.join(
-                self.config["vectorstore"]["db_path"],
-                "db_"
-                + self.config["vectorstore"]["db_option"]
-                + "_"
-                + self.config["vectorstore"]["model"],
-            ),
+            self.local_path,
             embedding_model,
             allow_dangerous_deserialization=True,
        )
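The faiss.py change factors the on-disk index location into self.local_path and appends the chunking mode, so indexes built with different chunking strategies are saved to and loaded from separate directories instead of overwriting each other. A rough illustration of how the path resolves, with made-up config values (only the key names come from the diff):

import os

# Hypothetical config values; key names mirror the diff.
config = {
    "vectorstore": {"db_path": "vector_db", "db_option": "FAISS", "model": "text-embedding-ada-002"},
    "splitter_options": {"chunking_mode": "semantic"},
}

local_path = os.path.join(
    config["vectorstore"]["db_path"],
    "db_" + config["vectorstore"]["db_option"]
    + "_" + config["vectorstore"]["model"]
    + "_" + config["splitter_options"]["chunking_mode"],
)
print(local_path)  # e.g. vector_db/db_FAISS_text-embedding-ada-002_semantic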