KingNish committed on
Commit
dbc91d5
1 Parent(s): 459ea62

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -11
app.py CHANGED
@@ -7,6 +7,7 @@ from huggingface_hub import InferenceClient
7
  import re
8
  import zipfile
9
  import xml.etree.ElementTree as ET
 
10
 
11
  # Constants
12
  CHUNK_SIZE = 32000
@@ -95,12 +96,20 @@ def extract_text_from_pptx(pptx_data, clean=True):
95
  def read_document(file, clean=True):
96
  """Reads content from various document formats."""
97
  file_path = file.name
98
- file_extension = file_path.split('.')[-1].lower()
99
 
100
  with open(file_path, "rb") as f:
101
  file_content = f.read()
102
 
103
- if file_extension == 'pdf':
 
 
 
 
 
 
 
 
104
  try:
105
  pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
106
  content = ''
@@ -111,8 +120,8 @@ def read_document(file, clean=True):
111
  return content, len(content)
112
  except Exception as e:
113
  return f"Error reading PDF: {e}", 0
114
-
115
- elif file_extension == 'xlsx':
116
  try:
117
  wb = load_workbook(io.BytesIO(file_content))
118
  content = ''
@@ -126,18 +135,32 @@ def read_document(file, clean=True):
126
  return content, len(content)
127
  except Exception as e:
128
  return f"Error reading XLSX: {e}", 0
129
-
130
- elif file_extension == 'pptx':
131
  try:
132
- return extract_text_from_pptx(file_content, clean)
 
 
 
133
  except Exception as e:
134
- return f"Error reading PPTX: {e}", 0
135
-
136
- elif file_extension == 'doc' or file_extension == 'docx':
 
 
 
 
 
 
 
137
  try:
138
  return extract_text_from_docx(file_content, clean)
139
  except Exception as e:
140
- return f"Error reading DOC/DOCX: {e}", 0
 
 
 
 
 
141
 
142
  else:
143
  try:
 
7
  import re
8
  import zipfile
9
  import xml.etree.ElementTree as ET
10
+ import filetype
11
 
12
  # Constants
13
  CHUNK_SIZE = 32000
 
96
  def read_document(file, clean=True):
97
  """Reads content from various document formats."""
98
  file_path = file.name
99
+ # File type is detected from the file's content, not from its extension
100
 
101
  with open(file_path, "rb") as f:
102
  file_content = f.read()
103
 
104
+ kind = filetype.guess(file_content)
105
+
106
+ if kind is None:
107
+ return "Cannot guess file type", 0 # Handle unknown file types
108
+
109
+ mime = kind.mime
110
+
111
+ if mime == "application/pdf":
112
+ # PDF Handling (unchanged)
113
  try:
114
  pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
115
  content = ''
 
120
  return content, len(content)
121
  except Exception as e:
122
  return f"Error reading PDF: {e}", 0
123
+ elif mime == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
124
+ # XLSX Handling (unchanged)
125
  try:
126
  wb = load_workbook(io.BytesIO(file_content))
127
  content = ''
 
135
  return content, len(content)
136
  except Exception as e:
137
  return f"Error reading XLSX: {e}", 0
138
+ elif mime == "text/plain":
 
139
  try:
140
+ content = file_content.decode('utf-8')
141
+ if clean:
142
+ content = clean_text(content)
143
+ return content, len(content)
144
  except Exception as e:
145
+ return f"Error reading TXT file: {e}", 0
146
+ elif mime == "text/csv":
147
+ try:
148
+ content = file_content.decode('utf-8')
149
+ if clean:
150
+ content = clean_text(content)
151
+ return content, len(content)
152
+ except Exception as e:
153
+ return f"Error reading CSV file: {e}", 0
154
+ elif mime == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
155
  try:
156
  return extract_text_from_docx(file_content, clean)
157
  except Exception as e:
158
+ return f"Error reading DOCX: {e}", 0
159
+ elif mime == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
160
+ try:
161
+ return extract_text_from_pptx(file_content, clean)
162
+ except Exception as e:
163
+ return f"Error reading PPTX: {e}", 0
164
 
165
  else:
166
  try: