simonzhang5429 committed on
Commit
5e30561
1 Parent(s): e0facf2

Create app.py

Files changed (1)
  1. app.py +110 -0
app.py ADDED
@@ -0,0 +1,110 @@
+ import gradio as gr
+ import os
+ import shutil
+ from pypdf import PdfReader
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ import torch
+ import fitz  # PyMuPDF
+
+ TOKENIZER_REPO = "MediaTek-Research/Breeze-7B-Instruct-v1_0"
+ tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_REPO, local_files_only=False, use_fast=True)
+ # Prompt prefix: "Please convert the following text to Traditional Chinese:"
+ tran_hints = "请将以下的文字转为繁体:"
+ # BOS/EOS markers; decoding below skips special tokens, so these are kept only for reference.
+ start_flag = "<s>"
+ end_flag = "</s>"
+ model = AutoModelForCausalLM.from_pretrained(
+     TOKENIZER_REPO,
+     device_map="auto",
+     local_files_only=False,
+     torch_dtype=torch.bfloat16,
+ )
+
+ def generate(text):
+     # Wrap the prompt in the Breeze chat template and run greedy generation.
+     chat_data = []
+     text = text.strip()
+     if text:
+         # Send the prompt as a user turn; chat templates expect a user message here.
+         chat_data.append({"role": "user", "content": text})
+     achat = tokenizer.apply_chat_template(chat_data, add_generation_prompt=True, return_tensors="pt")
+     outputs = model.generate(
+         achat.to(model.device),
+         max_new_tokens=2048,
+         do_sample=False,  # greedy decoding; top_p/top_k only take effect when sampling is enabled
+         top_p=0.01,
+         top_k=85,
+         repetition_penalty=1.1,
+     )
+     # Return only the newly generated tokens, without the prompt or special tokens.
+     return tokenizer.decode(outputs[0][achat.shape[-1]:], skip_special_tokens=True)
+
+ def tran_txt(input_txt):
+     # Prepend the translation instruction and generate the Traditional Chinese version.
+     data_txt = tran_hints + "\n" + input_txt.strip()
+     tran_result = generate(data_txt)
+     print("tran_result=" + tran_result)
+     # generate() already strips the prompt and the <s>/</s> markers from the output.
+     return tran_result
+
+ def exec_tran(file):
+     # Copy the uploaded PDF into ./data and split it into sentence-sized chunks.
+     temp_file = upload_file(file)
+     page_texts = read_paragraphs(temp_file)
+     # Derive the output path from the copied file: foo.pdf -> foo_result.txt
+     temp_result_file = temp_file
+     file_index = temp_result_file.rfind('.pdf')
+     if file_index != -1:
+         temp_result_file = temp_result_file[0:file_index]
+     temp_result_file = temp_result_file + "_result.txt"
+     with open(temp_result_file, 'w', encoding='utf-8') as fw:
+         for page_content in page_texts:
+             tran_result = tran_txt(page_content)
+             fw.write(tran_result + "\n")
+     return temp_result_file
+
+ def upload_file(file):
+     # Gradio may pass either a file path or a tempfile wrapper; normalise to a path string.
+     file_path = file.name if hasattr(file, "name") else file
+     UPLOAD_FOLDER = "./data"
+     if not os.path.exists(UPLOAD_FOLDER):
+         os.mkdir(UPLOAD_FOLDER)
+     return shutil.copy(file_path, UPLOAD_FOLDER)
+
+ def read_paragraphs(pdf_path):
+     # Extract plain text with PyMuPDF and split it on the full-width full stop "。"
+     # so the model receives sentence-sized chunks.
+     document = fitz.open(pdf_path)
+     paragraphs = []
+     for page in document:
+         text = page.get_text("text")
+         para_list = text.split('。')
+         paragraphs.extend([para for para in para_list if para.strip()])
+     document.close()
+     return paragraphs
+
+ def load_pdf_pages(filename):
+     # Alternative pypdf-based loader (not wired into the UI below).
+     page_texts = []
+     reader = PdfReader(filename)
+     for page in reader.pages:
+         page_texts.append(page.extract_text())
+     return page_texts
+
+ def exec_translate(file):
+     # Unused variant of exec_tran that reads whole pages via pypdf.
+     upload_file(file)
+     page_texts = load_pdf_pages(file.name)
+     return page_texts
+
+ with gr.Blocks() as app:
+     file_output = gr.File()
+     # Button label: "Upload PDF file"; restrict uploads to a single .pdf file.
+     upload_button = gr.UploadButton("上传pdf文件", file_types=[".pdf"], file_count="single")
+     # When a file is uploaded, translate it and expose the result file for download.
+     upload_button.upload(exec_tran, upload_button, file_output)
+
+ app.launch()