davila7 committed on
Commit
d299aec
1 Parent(s): 302ccf8

pptx support

Browse files
Files changed (3) hide show
  1. __pycache__/utils.cpython-310.pyc +0 -0
  2. app.py +12 -3
  3. utils.py +14 -0
__pycache__/utils.cpython-310.pyc CHANGED
Binary files a/__pycache__/utils.cpython-310.pyc and b/__pycache__/utils.cpython-310.pyc differ
 
app.py CHANGED
@@ -6,6 +6,7 @@ from utils import (
6
  parse_pdf,
7
  parse_txt,
8
  parse_csv,
 
9
  search_docs,
10
  embed_docs,
11
  text_to_docs,
@@ -39,7 +40,7 @@ with st.sidebar:
39
 
40
  uploaded_file = st.file_uploader(
41
  "Upload a pdf, docx, or txt file",
42
- type=["pdf", "docx", "txt", "csv"],
43
  help="Scanned documents are not supported yet!",
44
  on_change=clear_submit,
45
  )
@@ -53,10 +54,13 @@ with st.sidebar:
53
  doc = parse_csv(uploaded_file)
54
  elif uploaded_file.name.endswith(".txt"):
55
  doc = parse_txt(uploaded_file)
 
 
56
  else:
57
  st.error("File type not supported")
58
  doc = None
59
  text = text_to_docs(doc)
 
60
  try:
61
  with st.spinner("Indexing document... This may take a while⏳"):
62
  index = embed_docs(text)
@@ -67,8 +71,13 @@ with st.sidebar:
67
  tab1, tab2 = st.tabs(["Intro", "Chat with the File"])
68
  with tab1:
69
  st.markdown("### How does it work?")
70
- st.markdown('<p>Read the article to know how it works: <a target="_blank" href="https://medium.com/@dan.avila7/file-gpt-conversaci%C3%B3n-por-chat-con-un-archivo-698d17570358">Medium Article</a></p>', unsafe_allow_html=True)
71
- st.write("File GPT was written with the following tools:")
 
 
 
 
 
72
  st.markdown("#### Code GPT")
73
  st.write('All code was written with the help of Code GPT. Visit https://codegpt.co to get the extension.')
74
  st.markdown("#### Streamlit")
 
6
  parse_pdf,
7
  parse_txt,
8
  parse_csv,
9
+ parse_pptx,
10
  search_docs,
11
  embed_docs,
12
  text_to_docs,
 
40
 
41
  uploaded_file = st.file_uploader(
42
  "Upload a pdf, docx, or txt file",
43
+ type=["pdf", "docx", "txt", "csv", "pptx"],
44
  help="Scanned documents are not supported yet!",
45
  on_change=clear_submit,
46
  )
 
54
  doc = parse_csv(uploaded_file)
55
  elif uploaded_file.name.endswith(".txt"):
56
  doc = parse_txt(uploaded_file)
57
+ elif uploaded_file.name.endswith(".pptx"):
58
+ doc = parse_pptx(uploaded_file)
59
  else:
60
  st.error("File type not supported")
61
  doc = None
62
  text = text_to_docs(doc)
63
+ st.write(text)
64
  try:
65
  with st.spinner("Indexing document... This may take a while⏳"):
66
  index = embed_docs(text)
 
71
  tab1, tab2 = st.tabs(["Intro", "Chat with the File"])
72
  with tab1:
73
  st.markdown("### How does it work?")
74
+ st.write("File GPT is a tool that allows you to ask questions about a document and get answers from the document. The tool uses the OpenAI API to embed the document and then uses the Embedding API to find the most similar documents to the question. The tool then uses LangChain to obtain the answer from the most similar documents.")
75
+ st.write("The tool is currently in beta and is not perfect. It is recommended to use it with short documents.")
76
+ st.write("""---""")
77
+ st.markdown("### How to use it?")
78
+ st.write("To use the tool you must first add your OpenAI API Key and then upload a document. The tool currently supports the following file types: pdf, docx, txt, csv, pptx. Once the document is uploaded, the tool will index the document and embed it. This may take a while depending on the size of the document. Once the document is indexed, you can ask questions about the document. The tool will return the answer to the question and the source of the answer.")
79
+ st.markdown('<p>Read the article to know more details: <a target="_blank" href="https://medium.com/@dan.avila7/file-gpt-conversaci%C3%B3n-por-chat-con-un-archivo-698d17570358">Medium Article (Spanish)</a></p>', unsafe_allow_html=True)
80
+ st.write("## File GPT was written with the following tools:")
81
  st.markdown("#### Code GPT")
82
  st.write('All code was written with the help of Code GPT. Visit https://codegpt.co to get the extension.')
83
  st.markdown("#### Streamlit")
utils.py CHANGED
@@ -16,6 +16,7 @@ import streamlit as st
16
  from prompts import STUFF_PROMPT
17
  from pypdf import PdfReader
18
  from openai.error import AuthenticationError
 
19
 
20
  @st.experimental_memo()
21
  def parse_docx(file: BytesIO) -> str:
@@ -50,6 +51,19 @@ def parse_txt(file: BytesIO) -> str:
50
  text = re.sub(r"\n\s*\n", "\n\n", text)
51
  return text
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  @st.experimental_memo()
54
  def parse_csv(uploaded_file):
55
  # To read file as bytes:
 
16
  from prompts import STUFF_PROMPT
17
  from pypdf import PdfReader
18
  from openai.error import AuthenticationError
19
+ import pptx
20
 
21
  @st.experimental_memo()
22
  def parse_docx(file: BytesIO) -> str:
 
51
  text = re.sub(r"\n\s*\n", "\n\n", text)
52
  return text
53
 
54
+ @st.experimental_memo()
55
+ def parse_pptx(file: BytesIO) -> str:
56
+
57
+ ppt_file = pptx.Presentation(file)
58
+
59
+ string_data = ""
60
+
61
+ for slide in ppt_file.slides:
62
+ for shape in slide.shapes:
63
+ if shape.has_text_frame:
64
+ string_data += shape.text_frame.text + '\n'
65
+ return string_data
66
+
67
  @st.experimental_memo()
68
  def parse_csv(uploaded_file):
69
  # To read file as bytes: