petrsovadina committed on
Commit
f7e1e6e
1 Parent(s): 6b6076c

Update presidio_streamlit.py

Browse files
Files changed (1) hide show
  1. presidio_streamlit.py +76 -272
presidio_streamlit.py CHANGED
@@ -1,12 +1,10 @@
1
- """Streamlit app for Presidio."""
2
  import logging
3
  import os
4
  import traceback
5
 
6
- import dotenv
7
  import pandas as pd
8
  import streamlit as st
9
- import streamlit.components.v1 as components
10
  from annotated_text import annotated_text
11
  from streamlit_tags import st_tags
12
 
@@ -21,7 +19,7 @@ from presidio_helpers import (
21
  )
22
 
23
  st.set_page_config(
24
- page_title="Presidio demo",
25
  layout="wide",
26
  initial_sidebar_state="expanded",
27
  menu_items={
@@ -29,347 +27,153 @@ st.set_page_config(
29
  },
30
  )
31
 
32
- dotenv.load_dotenv()
33
  logger = logging.getLogger("presidio-streamlit")
34
 
35
-
36
- allow_other_models = os.getenv("ALLOW_OTHER_MODELS", False)
37
-
38
-
39
  # Sidebar
40
- st.sidebar.header(
41
- """
42
- PII De-Identification with [Microsoft Presidio](https://microsoft.github.io/presidio/)
43
- """
44
- )
45
-
46
 
47
  model_help_text = """
48
- Select which Named Entity Recognition (NER) model to use for PII detection, in parallel to rule-based recognizers.
49
- Presidio supports multiple NER packages off-the-shelf, such as spaCy, Huggingface, Stanza and Flair,
50
- as well as service such as Azure Text Analytics PII.
51
  """
52
- st_ta_key = st_ta_endpoint = ""
53
 
54
  model_list = [
55
- "spaCy/en_core_web_lg",
56
  "iiiorg/piiranha-v1-detect-personal-information",
57
- "FacebookAI/xlm-roberta-large-finetuned-conll03-english",
58
  ]
59
- if not allow_other_models:
60
- model_list.pop()
61
- # Select model
62
  st_model = st.sidebar.selectbox(
63
- "NER model package",
64
  model_list,
65
- index=2,
66
  help=model_help_text,
67
  )
68
 
69
- # Extract model package.
70
  st_model_package = st_model.split("/")[0]
 
71
 
72
- # Remove package prefix (if needed)
73
- st_model = (
74
- st_model
75
- if st_model_package.lower() not in ("spacy","piiiranha")
76
- else "/".join(st_model.split("/")[1:])
77
- )
78
-
79
- if st_model == "Other":
80
- st_model_package = st.sidebar.selectbox(
81
- "NER model OSS package", options=["spacy","piiiranha"]
82
- )
83
- st_model = st.sidebar.text_input(f"NER model name", value="")
84
-
85
-
86
-
87
-
88
- st.sidebar.warning("Note: Models might take some time to download. ")
89
-
90
- analyzer_params = (st_model_package, st_model, st_ta_key, st_ta_endpoint)
91
  logger.debug(f"analyzer_params: {analyzer_params}")
92
 
93
  st_operator = st.sidebar.selectbox(
94
- "De-identification approach",
95
- ["redact", "replace", "synthesize", "highlight", "mask", "hash", "encrypt"],
96
- index=1,
97
  help="""
98
- Select which manipulation to the text is requested after PII has been identified.\n
99
- - Redact: Completely remove the PII text\n
100
- - Replace: Replace the PII text with a constant, e.g. <PERSON>\n
101
- - Synthesize: Replace with fake values (requires an OpenAI key)\n
102
- - Highlight: Shows the original text with PII highlighted in colors\n
103
- - Mask: Replaces a requested number of characters with an asterisk (or other mask character)\n
104
- - Hash: Replaces with the hash of the PII string\n
105
- - Encrypt: Replaces with an AES encryption of the PII string, allowing the process to be reversed
106
- """,
107
  )
108
- st_mask_char = "*"
109
- st_number_of_chars = 15
110
- st_encrypt_key = "WmZq4t7w!z%C&F)J"
111
-
112
- open_ai_params = None
113
-
114
- logger.debug(f"st_operator: {st_operator}")
115
-
116
-
117
- def set_up_openai_synthesis():
118
- """Set up the OpenAI API key and model for text synthesis."""
119
-
120
- if os.getenv("OPENAI_TYPE", default="openai") == "Azure":
121
- openai_api_type = "azure"
122
- st_openai_api_base = st.sidebar.text_input(
123
- "Azure OpenAI base URL",
124
- value=os.getenv("AZURE_OPENAI_ENDPOINT", default=""),
125
- )
126
- openai_key = os.getenv("AZURE_OPENAI_KEY", default="")
127
- st_deployment_id = st.sidebar.text_input(
128
- "Deployment name", value=os.getenv("AZURE_OPENAI_DEPLOYMENT", default="")
129
- )
130
- st_openai_version = st.sidebar.text_input(
131
- "OpenAI version",
132
- value=os.getenv("OPENAI_API_VERSION", default="2023-05-15"),
133
- )
134
- else:
135
- openai_api_type = "openai"
136
- st_openai_version = st_openai_api_base = None
137
- st_deployment_id = ""
138
- openai_key = os.getenv("OPENAI_KEY", default="")
139
- st_openai_key = st.sidebar.text_input(
140
- "OPENAI_KEY",
141
- value=openai_key,
142
- help="See https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key for more info.",
143
- type="password",
144
- )
145
- st_openai_model = st.sidebar.text_input(
146
- "OpenAI model for text synthesis",
147
- value=os.getenv("OPENAI_MODEL", default="gpt-3.5-turbo-instruct"),
148
- help="See more here: https://platform.openai.com/docs/models/",
149
- )
150
- return (
151
- openai_api_type,
152
- st_openai_api_base,
153
- st_deployment_id,
154
- st_openai_version,
155
- st_openai_key,
156
- st_openai_model,
157
- )
158
 
 
 
159
 
160
- if st_operator == "mask":
161
  st_number_of_chars = st.sidebar.number_input(
162
- "number of chars", value=st_number_of_chars, min_value=0, max_value=100
163
  )
164
  st_mask_char = st.sidebar.text_input(
165
- "Mask character", value=st_mask_char, max_chars=1
166
- )
167
- elif st_operator == "encrypt":
168
- st_encrypt_key = st.sidebar.text_input("AES key", value=st_encrypt_key)
169
- elif st_operator == "synthesize":
170
- (
171
- openai_api_type,
172
- st_openai_api_base,
173
- st_deployment_id,
174
- st_openai_version,
175
- st_openai_key,
176
- st_openai_model,
177
- ) = set_up_openai_synthesis()
178
-
179
- open_ai_params = OpenAIParams(
180
- openai_key=st_openai_key,
181
- model=st_openai_model,
182
- api_base=st_openai_api_base,
183
- deployment_id=st_deployment_id,
184
- api_version=st_openai_version,
185
- api_type=openai_api_type,
186
  )
187
 
188
  st_threshold = st.sidebar.slider(
189
- label="Acceptance threshold",
190
  min_value=0.0,
191
  max_value=1.0,
192
  value=0.35,
193
- help="Define the threshold for accepting a detection as PII. See more here: ",
194
- )
195
-
196
- st_return_decision_process = st.sidebar.checkbox(
197
- "Add analysis explanations to findings",
198
- value=False,
199
- help="Add the decision process to the output table. "
200
- "More information can be found here: https://microsoft.github.io/presidio/analyzer/decision_process/",
201
  )
202
 
203
- # Allow and deny lists
204
- st_deny_allow_expander = st.sidebar.expander(
205
- "Allowlists and denylists",
206
- expanded=False,
207
- )
208
-
209
- with st_deny_allow_expander:
210
- st_allow_list = st_tags(
211
- label="Add words to the allowlist", text="Enter word and press enter."
212
- )
213
- st.caption(
214
- "Allowlists contain words that are not considered PII, but are detected as such."
215
- )
216
 
217
- st_deny_list = st_tags(
218
- label="Add words to the denylist", text="Enter word and press enter."
219
- )
220
- st.caption(
221
- "Denylists contain words that are considered PII, but are not detected as such."
222
- )
223
- # Main panel
224
-
225
- with st.expander("About this demo", expanded=False):
226
- st.info(
227
- """Presidio is an open source customizable framework for PII detection and de-identification.
228
- \n\n[Code](https://aka.ms/presidio) |
229
- [Tutorial](https://microsoft.github.io/presidio/tutorial/) |
230
- [Installation](https://microsoft.github.io/presidio/installation/) |
231
- [FAQ](https://microsoft.github.io/presidio/faq/) |
232
- [Feedback](https://forms.office.com/r/9ufyYjfDaY) |"""
233
- )
234
-
235
- st.info(
236
- """
237
- Use this demo to:
238
- - Experiment with different off-the-shelf models and NLP packages.
239
- - Explore the different de-identification options, including redaction, masking, encryption and more.
240
- - Generate synthetic text with Microsoft Presidio and OpenAI.
241
- - Configure allow and deny lists.
242
-
243
- This demo website shows some of Presidio's capabilities.
244
- [Visit our website](https://microsoft.github.io/presidio) for more info,
245
- samples and deployment options.
246
- """
247
- )
248
 
249
- st.markdown(
250
- "[![Pypi Downloads](https://img.shields.io/pypi/dm/presidio-analyzer.svg)](https://img.shields.io/pypi/dm/presidio-analyzer.svg)" # noqa
251
- "[![MIT license](https://img.shields.io/badge/license-MIT-brightgreen.svg)](https://opensource.org/licenses/MIT)"
252
- "![GitHub Repo stars](https://img.shields.io/github/stars/microsoft/presidio?style=social)"
253
- )
254
-
255
- analyzer_load_state = st.info("Starting Presidio analyzer...")
256
-
257
- analyzer_load_state.empty()
258
-
259
- # Read default text
260
- with open("demo_text.txt") as f:
261
- demo_text = f.readlines()
262
-
263
- # Create two columns for before and after
264
  col1, col2 = st.columns(2)
265
 
266
- # Before:
267
- col1.subheader("Input")
268
  st_text = col1.text_area(
269
- label="Enter text", value="".join(demo_text), height=400, key="text_input"
270
  )
271
 
272
  try:
273
- # Choose entities
274
- st_entities_expander = st.sidebar.expander("Choose entities to look for")
275
  st_entities = st_entities_expander.multiselect(
276
- label="Which entities to look for?",
277
  options=get_supported_entities(*analyzer_params),
278
  default=list(get_supported_entities(*analyzer_params)),
279
- help="Limit the list of PII entities detected. "
280
- "This list is dynamic and based on the NER model and registered recognizers. "
281
- "More information can be found here: https://microsoft.github.io/presidio/analyzer/adding_recognizers/",
282
  )
283
 
284
- # Before
285
- analyzer_load_state = st.info("Starting Presidio analyzer...")
286
  analyzer = analyzer_engine(*analyzer_params)
287
  analyzer_load_state.empty()
288
 
 
289
  st_analyze_results = analyze(
290
  *analyzer_params,
291
  text=st_text,
292
  entities=st_entities,
293
- language="en",
294
  score_threshold=st_threshold,
295
- return_decision_process=st_return_decision_process,
296
- allow_list=st_allow_list,
297
- deny_list=st_deny_list,
298
- )
299
-
300
- # After
301
- if st_operator not in ("highlight", "synthesize"):
302
- with col2:
303
- st.subheader(f"Output")
304
- st_anonymize_results = anonymize(
305
- text=st_text,
306
- operator=st_operator,
307
- mask_char=st_mask_char,
308
- number_of_chars=st_number_of_chars,
309
- encrypt_key=st_encrypt_key,
310
- analyze_results=st_analyze_results,
311
- )
312
- st.text_area(
313
- label="De-identified", value=st_anonymize_results.text, height=400
314
- )
315
- elif st_operator == "synthesize":
316
- with col2:
317
- st.subheader(f"OpenAI Generated output")
318
- fake_data = create_fake_data(
319
- st_text,
320
- st_analyze_results,
321
- open_ai_params,
322
- )
323
- st.text_area(label="Synthetic data", value=fake_data, height=400)
324
  else:
325
- st.subheader("Highlighted")
326
  annotated_tokens = annotate(text=st_text, analyze_results=st_analyze_results)
327
- # annotated_tokens
328
  annotated_text(*annotated_tokens)
329
 
330
- # table result
331
- st.subheader(
332
- "Findings"
333
- if not st_return_decision_process
334
- else "Findings with decision factors"
335
- )
336
  if st_analyze_results:
337
  df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
338
  df["text"] = [st_text[res.start : res.end] for res in st_analyze_results]
339
 
340
  df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
341
  {
342
- "entity_type": "Entity type",
343
  "text": "Text",
344
- "start": "Start",
345
- "end": "End",
346
- "score": "Confidence",
347
  },
348
  axis=1,
349
  )
350
- df_subset["Text"] = [st_text[res.start : res.end] for res in st_analyze_results]
351
- if st_return_decision_process:
352
- analysis_explanation_df = pd.DataFrame.from_records(
353
- [r.analysis_explanation.to_dict() for r in st_analyze_results]
354
- )
355
- df_subset = pd.concat([df_subset, analysis_explanation_df], axis=1)
356
  st.dataframe(df_subset.reset_index(drop=True), use_container_width=True)
357
  else:
358
- st.text("No findings")
359
 
360
  except Exception as e:
361
  print(e)
362
  traceback.print_exc()
363
- st.error(e)
364
-
365
- components.html(
366
- """
367
- <script type="text/javascript">
368
- (function(c,l,a,r,i,t,y){
369
- c[a]=c[a]||function(){(c[a].q=c[a].q||[]).push(arguments)};
370
- t=l.createElement(r);t.async=1;t.src="https://www.clarity.ms/tag/"+i;
371
- y=l.getElementsByTagName(r)[0];y.parentNode.insertBefore(t,y);
372
- })(window, document, "clarity", "script", "h7f8bp42n8");
373
- </script>
374
- """
375
- )
 
1
+ """Streamlit app pro anonymizaci českých textů s využitím Presidio."""
2
  import logging
3
  import os
4
  import traceback
5
 
 
6
  import pandas as pd
7
  import streamlit as st
 
8
  from annotated_text import annotated_text
9
  from streamlit_tags import st_tags
10
 
 
19
  )
20
 
21
  st.set_page_config(
22
+ page_title="Anonymizace českých textů",
23
  layout="wide",
24
  initial_sidebar_state="expanded",
25
  menu_items={
 
27
  },
28
  )
29
 
 
30
  logger = logging.getLogger("presidio-streamlit")
31
 
 
 
 
 
32
  # Sidebar
33
+ st.sidebar.header("Anonymizace osobních údajů v českých textech")
 
 
 
 
 
34
 
35
  model_help_text = """
36
+ Vyberte model pro rozpoznávání pojmenovaných entit (NER) pro detekci osobních údajů.
 
 
37
  """
 
38
 
39
  model_list = [
 
40
  "iiiorg/piiranha-v1-detect-personal-information",
41
+ "spaCy/cs_core_news_sm",
42
  ]
43
+
 
 
44
  st_model = st.sidebar.selectbox(
45
+ "NER model",
46
  model_list,
47
+ index=0,
48
  help=model_help_text,
49
  )
50
 
 
51
  st_model_package = st_model.split("/")[0]
52
+ st_model = "/".join(st_model.split("/")[1:])
53
 
54
+ analyzer_params = (st_model_package, st_model, None, None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  logger.debug(f"analyzer_params: {analyzer_params}")
56
 
57
  st_operator = st.sidebar.selectbox(
58
+ "Metoda anonymizace",
59
+ ["nahrazení", "maskování", "zvýraznění"],
60
+ index=0,
61
  help="""
62
+ Vyberte způsob anonymizace textu po identifikaci osobních údajů.\n
63
+ - Nahrazení: Nahradí osobní údaj obecným označením, např. <OSOBA>\n
64
+ - Maskování: Nahradí část osobního údaje hvězdičkami\n
65
+ - Zvýraznění: Zvýrazní osobní údaje v původním textu
66
+ """,
 
 
 
 
67
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
+ st_mask_char = "*"
70
+ st_number_of_chars = 4
71
 
72
+ if st_operator == "maskování":
73
  st_number_of_chars = st.sidebar.number_input(
74
+ "Počet znaků k maskování", value=st_number_of_chars, min_value=0, max_value=100
75
  )
76
  st_mask_char = st.sidebar.text_input(
77
+ "Znak pro maskování", value=st_mask_char, max_chars=1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  )
79
 
80
  st_threshold = st.sidebar.slider(
81
+ label="Práh přijetí",
82
  min_value=0.0,
83
  max_value=1.0,
84
  value=0.35,
85
+ help="Definujte práh pro přijetí detekce jako osobní údaj.",
 
 
 
 
 
 
 
86
  )
87
 
88
+ # Hlavní panel
89
+ st.title("Anonymizace českých textů")
 
 
 
 
 
 
 
 
 
 
 
90
 
91
+ # Načtení ukázkového textu
92
+ with open("demo_text.txt", "r", encoding="utf-8") as f:
93
+ demo_text = f.read()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
+ # Vytvoření dvou sloupců pro vstup a výstup
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  col1, col2 = st.columns(2)
97
 
98
+ # Vstup
99
+ col1.subheader("Vstupní text")
100
  st_text = col1.text_area(
101
+ label="Zadejte text", value=demo_text, height=400, key="text_input"
102
  )
103
 
104
  try:
105
+ # Výběr entit
106
+ st_entities_expander = st.sidebar.expander("Vyberte entity k detekci")
107
  st_entities = st_entities_expander.multiselect(
108
+ label="Které entity hledat?",
109
  options=get_supported_entities(*analyzer_params),
110
  default=list(get_supported_entities(*analyzer_params)),
111
+ help="Omezte seznam detekovaných osobních údajů.",
 
 
112
  )
113
 
114
+ # Inicializace analyzátoru
115
+ analyzer_load_state = st.info("Spouštění Presidio analyzátoru...")
116
  analyzer = analyzer_engine(*analyzer_params)
117
  analyzer_load_state.empty()
118
 
119
+ # Analýza textu
120
  st_analyze_results = analyze(
121
  *analyzer_params,
122
  text=st_text,
123
  entities=st_entities,
124
+ language="cs",
125
  score_threshold=st_threshold,
126
+ return_decision_process=False,
127
+ allow_list=[],
128
+ deny_list=[],
129
+ )
130
+
131
+ # Výstup
132
+ col2.subheader("Výstup")
133
+ if st_operator != "zvýraznění":
134
+ st_anonymize_results = anonymize(
135
+ text=st_text,
136
+ operator=st_operator,
137
+ mask_char=st_mask_char,
138
+ number_of_chars=st_number_of_chars,
139
+ analyze_results=st_analyze_results,
140
+ )
141
+ col2.text_area(
142
+ label="Anonymizovaný text", value=st_anonymize_results.text, height=400
143
+ )
 
 
 
 
 
 
 
 
 
 
 
144
  else:
 
145
  annotated_tokens = annotate(text=st_text, analyze_results=st_analyze_results)
 
146
  annotated_text(*annotated_tokens)
147
 
148
+ # Tabulka s výsledky
149
+ st.subheader("Nalezené osobní údaje")
 
 
 
 
150
  if st_analyze_results:
151
  df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
152
  df["text"] = [st_text[res.start : res.end] for res in st_analyze_results]
153
 
154
  df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
155
  {
156
+ "entity_type": "Typ entity",
157
  "text": "Text",
158
+ "start": "Začátek",
159
+ "end": "Konec",
160
+ "score": "Důvěryhodnost",
161
  },
162
  axis=1,
163
  )
 
 
 
 
 
 
164
  st.dataframe(df_subset.reset_index(drop=True), use_container_width=True)
165
  else:
166
+ st.text("Žádné osobní údaje nebyly nalezeny.")
167
 
168
  except Exception as e:
169
  print(e)
170
  traceback.print_exc()
171
+ st.error(f"Došlo k chybě: {str(e)}")
172
+
173
+ # Informace o aplikaci
174
+ st.sidebar.markdown("---")
175
+ st.sidebar.subheader("O aplikaci")
176
+ st.sidebar.info(
177
+ "Tato aplikace anonymizuje osobní údaje v českých textech. "
178
+ "Využívá Microsoft Presidio a pokročilé NLP techniky pro detekci a anonymizaci PII."
179
+ )