adi-123 committed
Commit 1254333 • 1 Parent(s): f6c162d

Update app.py

Files changed (1)
  1. app.py +42 -35
app.py CHANGED
@@ -2,57 +2,52 @@ import os
 import streamlit as st
 import requests
 from transformers import pipeline
-from together import Together
 from typing import Dict
 
 # Image-to-text
 def img2txt(url: str) -> str:
-    st.info("Initializing captioning model...")
+    print("Initializing captioning model...")
     captioning_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
 
-    st.info("Generating text from the image...")
+    print("Generating text from the image...")
     text = captioning_model(url, max_new_tokens=20)[0]["generated_text"]
 
+    print(text)
     return text
 
-# Text-to-story
+# Text-to-story generation with LLM model
 def txt2story(prompt: str, top_k: int, top_p: float, temperature: float) -> str:
+    # Load the Together API client
     client = Together(api_key=os.environ.get("TOGETHER_API_KEY"))
+
+    # Modify the prompt based on user inputs and ensure a 250-word limit
+    story_prompt = f"Write a short story of no more than 250 words based on the following prompt: {prompt}"
+
+    # Call the LLM model
     stream = client.chat.completions.create(
         model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
         messages=[
-            {"role": "system", "content": '''You are a creative story writer. Create a meaningful, positive, and inspirational story based on the provided prompt. Ensure it stays under 250 words and ends on a happy note.'''},
-            {"role": "user", "content": prompt}
+            {"role": "system", "content": '''As an experienced short story writer, write a meaningful story influenced by the provided prompt.
+            Ensure the story is full of positive inspiration & enthusiasm and concludes with a happy ending.
+            Ensure the story does not exceed 250 words.'''},
+            {"role": "user", "content": story_prompt}
         ],
         top_k=top_k,
         top_p=top_p,
         temperature=temperature,
         stream=True
     )
-
+
+    # Concatenate story chunks
     story = ''
     for chunk in stream:
         story += chunk.choices[0].delta.content
 
-    # Enforce 250-word limit
-    story_words = story.split()
-    if len(story_words) > 250:
-        story = ' '.join(story_words[:250]) + '...'
-
-    return story
-
-# Translate story
-def translate_story(story: str, target_language: str) -> str:
-    if target_language != "English":
-        st.info(f"Translating story to {target_language}...")
-        translator = pipeline("text2text-generation", model="SnypzZz/Llama2-13b-Language-translate")
-        translated_story = translator(story, forced_bos_token_id=target_language)
-        return translated_story[0]["generated_text"]
     return story
 
 # Text-to-speech
 def txt2speech(text: str) -> None:
-    st.info("Initializing text-to-speech conversion...")
+    print("Initializing text-to-speech conversion...")
     API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
     headers = {"Authorization": f"Bearer {os.environ['HUGGINGFACEHUB_API_TOKEN']}"}
     payloads = {'inputs': text}
@@ -62,7 +57,17 @@ def txt2speech(text: str) -> None:
     with open('audio_story.mp3', 'wb') as file:
         file.write(response.content)
 
-# User preferences
+# Story translation function
+def translate_story(story: str, target_language: str) -> str:
+    # Translation pipeline
+    translation_model = pipeline("translation", model="SnypzZz/Llama2-13b-Language-translate")
+
+    print(f"Translating the story to {target_language}...")
+    translated_story = translation_model(story, max_length=400, tgt_lang=target_language)[0]['translation_text']
+
+    return translated_story
+
+# Get user preferences for the story
 def get_user_preferences() -> Dict[str, str]:
     preferences = {}
 
@@ -86,10 +91,10 @@ def main():
     st.title("Turn the Image into Audio Story")
 
     # Allows users to upload an image file
-    uploaded_file = st.file_uploader("📷 Upload an image...", type=["jpg", "jpeg", "png"])
+    uploaded_file = st.file_uploader("# 📷 Upload an image...", type=["jpg", "jpeg", "png"])
 
     # Parameters for LLM model (in the sidebar)
-    st.sidebar.markdown("## LLM Inference Configuration Parameters")
+    st.sidebar.markdown("# LLM Inference Configuration Parameters")
     top_k = st.sidebar.number_input("Top-K", min_value=1, max_value=100, value=5)
     top_p = st.sidebar.number_input("Top-P", min_value=0.0, max_value=1.0, value=0.8)
     temperature = st.sidebar.number_input("Temperature", min_value=0.1, max_value=2.0, value=1.5)
@@ -107,21 +112,23 @@ def main():
         st.image(uploaded_file, caption='🖼️ Uploaded Image', use_column_width=True)
 
         # Initiates AI processing and story generation
-        with st.spinner("🤖 AI is at Work! "):
+        with st.spinner("## 🤖 AI is at Work! "):
             scenario = img2txt("uploaded_image.jpg")  # Extracts text from the image
 
             # Modify the prompt to include user preferences
-            prompt = (f"Based on the image description: '{scenario}', create a {preferences['genre']} story set in {preferences['setting']}. "
-                      f"The story should have a {preferences['tone']} tone and explore the theme of {preferences['theme']}. "
-                      f"The main conflict should be {preferences['conflict']}. "
-                      f"Include {preferences['magic_tech']} as a key element. "
-                      f"The story should have a {preferences['twist']} and end with a {preferences['ending']} ending.")
+            prompt = f"Based on the image description: '{scenario}', create a {preferences['genre']} story set in {preferences['setting']}. " \
+                     f"The story should have a {preferences['tone']} tone and explore the theme of {preferences['theme']}. " \
+                     f"The main conflict should be {preferences['conflict']}. " \
+                     f"Include {preferences['magic_tech']} as a key element. " \
+                     f"The story should have a {preferences['twist']} and end with a {preferences['ending']} ending."
 
             story = txt2story(prompt, top_k, top_p, temperature)  # Generates a story based on the image text, LLM params, and user preferences
 
-            # Translate story based on user-selected language
-            translated_story = translate_story(story, preferences['language'])
-            txt2speech(translated_story)  # Converts the translated story to audio
+            # Translate the story if the user selected a non-English language
+            if preferences['language'] != "English":
+                story = translate_story(story, preferences['language'])
+
+            txt2speech(story)  # Converts the story to audio
 
         st.markdown("---")
         st.markdown("## 📜 Image Caption")
@@ -129,7 +136,7 @@ def main():
 
         st.markdown("---")
         st.markdown("## 📖 Story")
-        st.write(translated_story)
+        st.write(story)
 
         st.markdown("---")
         st.markdown("## 🎧 Audio Story")