GDPR

Running

App Files Files Community

petrsovadina commited on 3 days ago

Commit

f7e1e6e

•

1 Parent(s): 6b6076c

Update presidio_streamlit.py

Browse files

Files changed (1) hide show

presidio_streamlit.py +76 -272

presidio_streamlit.py CHANGED Viewed

@@ -1,12 +1,10 @@
-"""Streamlit app for Presidio."""
 import logging
 import os
 import traceback
-import dotenv
 import pandas as pd
 import streamlit as st
-import streamlit.components.v1 as components
 from annotated_text import annotated_text
 from streamlit_tags import st_tags
@@ -21,7 +19,7 @@ from presidio_helpers import (
 )
 st.set_page_config(
-    page_title="Presidio demo",
     layout="wide",
     initial_sidebar_state="expanded",
     menu_items={
@@ -29,347 +27,153 @@ st.set_page_config(
     },
 )
-dotenv.load_dotenv()
 logger = logging.getLogger("presidio-streamlit")
-allow_other_models = os.getenv("ALLOW_OTHER_MODELS", False)
 # Sidebar
-st.sidebar.header(
-    """
-PII De-Identification with [Microsoft Presidio](https://microsoft.github.io/presidio/)
-"""
-)
 model_help_text = """
-    Select which Named Entity Recognition (NER) model to use for PII detection, in parallel to rule-based recognizers.
-    Presidio supports multiple NER packages off-the-shelf, such as spaCy, Huggingface, Stanza and Flair,
-    as well as service such as Azure Text Analytics PII.
     """
-st_ta_key = st_ta_endpoint = ""
 model_list = [
-    "spaCy/en_core_web_lg",
     "iiiorg/piiranha-v1-detect-personal-information",
-    "FacebookAI/xlm-roberta-large-finetuned-conll03-english",
 ]
-if not allow_other_models:
-    model_list.pop()
-# Select model
 st_model = st.sidebar.selectbox(
-    "NER model package",
     model_list,
-    index=2,
     help=model_help_text,
 )
-# Extract model package.
 st_model_package = st_model.split("/")[0]
-# Remove package prefix (if needed)
-st_model = (
-    st_model
-    if st_model_package.lower() not in ("spacy","piiiranha")
-    else "/".join(st_model.split("/")[1:])
-)
-if st_model == "Other":
-    st_model_package = st.sidebar.selectbox(
-        "NER model OSS package", options=["spacy","piiiranha"]
-    )
-    st_model = st.sidebar.text_input(f"NER model name", value="")
-st.sidebar.warning("Note: Models might take some time to download. ")
-analyzer_params = (st_model_package, st_model, st_ta_key, st_ta_endpoint)
 logger.debug(f"analyzer_params: {analyzer_params}")
 st_operator = st.sidebar.selectbox(
-    "De-identification approach",
-    ["redact", "replace", "synthesize", "highlight", "mask", "hash", "encrypt"],
-    index=1,
     help="""
-    Select which manipulation to the text is requested after PII has been identified.\n
-    - Redact: Completely remove the PII text\n
-    - Replace: Replace the PII text with a constant, e.g. <PERSON>\n
-    - Synthesize: Replace with fake values (requires an OpenAI key)\n
-    - Highlight: Shows the original text with PII highlighted in colors\n
-    - Mask: Replaces a requested number of characters with an asterisk (or other mask character)\n
-    - Hash: Replaces with the hash of the PII string\n
-    - Encrypt: Replaces with an AES encryption of the PII string, allowing the process to be reversed
-         """,
 )
-st_mask_char = "*"
-st_number_of_chars = 15
-st_encrypt_key = "WmZq4t7w!z%C&F)J"
-open_ai_params = None
-logger.debug(f"st_operator: {st_operator}")
-def set_up_openai_synthesis():
-    """Set up the OpenAI API key and model for text synthesis."""
-    if os.getenv("OPENAI_TYPE", default="openai") == "Azure":
-        openai_api_type = "azure"
-        st_openai_api_base = st.sidebar.text_input(
-            "Azure OpenAI base URL",
-            value=os.getenv("AZURE_OPENAI_ENDPOINT", default=""),
-        )
-        openai_key = os.getenv("AZURE_OPENAI_KEY", default="")
-        st_deployment_id = st.sidebar.text_input(
-            "Deployment name", value=os.getenv("AZURE_OPENAI_DEPLOYMENT", default="")
-        )
-        st_openai_version = st.sidebar.text_input(
-            "OpenAI version",
-            value=os.getenv("OPENAI_API_VERSION", default="2023-05-15"),
-        )
-    else:
-        openai_api_type = "openai"
-        st_openai_version = st_openai_api_base = None
-        st_deployment_id = ""
-        openai_key = os.getenv("OPENAI_KEY", default="")
-    st_openai_key = st.sidebar.text_input(
-        "OPENAI_KEY",
-        value=openai_key,
-        help="See https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key for more info.",
-        type="password",
-    )
-    st_openai_model = st.sidebar.text_input(
-        "OpenAI model for text synthesis",
-        value=os.getenv("OPENAI_MODEL", default="gpt-3.5-turbo-instruct"),
-        help="See more here: https://platform.openai.com/docs/models/",
-    )
-    return (
-        openai_api_type,
-        st_openai_api_base,
-        st_deployment_id,
-        st_openai_version,
-        st_openai_key,
-        st_openai_model,
-    )
-if st_operator == "mask":
     st_number_of_chars = st.sidebar.number_input(
-        "number of chars", value=st_number_of_chars, min_value=0, max_value=100
     )
     st_mask_char = st.sidebar.text_input(
-        "Mask character", value=st_mask_char, max_chars=1
-    )
-elif st_operator == "encrypt":
-    st_encrypt_key = st.sidebar.text_input("AES key", value=st_encrypt_key)
-elif st_operator == "synthesize":
-    (
-        openai_api_type,
-        st_openai_api_base,
-        st_deployment_id,
-        st_openai_version,
-        st_openai_key,
-        st_openai_model,
-    ) = set_up_openai_synthesis()
-    open_ai_params = OpenAIParams(
-        openai_key=st_openai_key,
-        model=st_openai_model,
-        api_base=st_openai_api_base,
-        deployment_id=st_deployment_id,
-        api_version=st_openai_version,
-        api_type=openai_api_type,
     )
 st_threshold = st.sidebar.slider(
-    label="Acceptance threshold",
     min_value=0.0,
     max_value=1.0,
     value=0.35,
-    help="Define the threshold for accepting a detection as PII. See more here: ",
-)
-st_return_decision_process = st.sidebar.checkbox(
-    "Add analysis explanations to findings",
-    value=False,
-    help="Add the decision process to the output table. "
-    "More information can be found here: https://microsoft.github.io/presidio/analyzer/decision_process/",
 )
-# Allow and deny lists
-st_deny_allow_expander = st.sidebar.expander(
-    "Allowlists and denylists",
-    expanded=False,
-)
-with st_deny_allow_expander:
-    st_allow_list = st_tags(
-        label="Add words to the allowlist", text="Enter word and press enter."
-    )
-    st.caption(
-        "Allowlists contain words that are not considered PII, but are detected as such."
-    )
-    st_deny_list = st_tags(
-        label="Add words to the denylist", text="Enter word and press enter."
-    )
-    st.caption(
-        "Denylists contain words that are considered PII, but are not detected as such."
-    )
-# Main panel
-with st.expander("About this demo", expanded=False):
-    st.info(
-        """Presidio is an open source customizable framework for PII detection and de-identification.
-        \n\n[Code](https://aka.ms/presidio) |
-        [Tutorial](https://microsoft.github.io/presidio/tutorial/) |
-        [Installation](https://microsoft.github.io/presidio/installation/) |
-        [FAQ](https://microsoft.github.io/presidio/faq/) |
-        [Feedback](https://forms.office.com/r/9ufyYjfDaY) |"""
-    )
-    st.info(
-        """
-    Use this demo to:
-    - Experiment with different off-the-shelf models and NLP packages.
-    - Explore the different de-identification options, including redaction, masking, encryption and more.
-    - Generate synthetic text with Microsoft Presidio and OpenAI.
-    - Configure allow and deny lists.
-    This demo website shows some of Presidio's capabilities.
-    [Visit our website](https://microsoft.github.io/presidio) for more info,
-    samples and deployment options.
-    """
-    )
-    st.markdown(
-        "[![Pypi Downloads](https://img.shields.io/pypi/dm/presidio-analyzer.svg)](https://img.shields.io/pypi/dm/presidio-analyzer.svg)"  # noqa
-        "[![MIT license](https://img.shields.io/badge/license-MIT-brightgreen.svg)](https://opensource.org/licenses/MIT)"
-        "![GitHub Repo stars](https://img.shields.io/github/stars/microsoft/presidio?style=social)"
-    )
-analyzer_load_state = st.info("Starting Presidio analyzer...")
-analyzer_load_state.empty()
-# Read default text
-with open("demo_text.txt") as f:
-    demo_text = f.readlines()
-# Create two columns for before and after
 col1, col2 = st.columns(2)
-# Before:
-col1.subheader("Input")
 st_text = col1.text_area(
-    label="Enter text", value="".join(demo_text), height=400, key="text_input"
 )
 try:
-    # Choose entities
-    st_entities_expander = st.sidebar.expander("Choose entities to look for")
     st_entities = st_entities_expander.multiselect(
-        label="Which entities to look for?",
         options=get_supported_entities(*analyzer_params),
         default=list(get_supported_entities(*analyzer_params)),
-        help="Limit the list of PII entities detected. "
-        "This list is dynamic and based on the NER model and registered recognizers. "
-        "More information can be found here: https://microsoft.github.io/presidio/analyzer/adding_recognizers/",
     )
-    # Before
-    analyzer_load_state = st.info("Starting Presidio analyzer...")
     analyzer = analyzer_engine(*analyzer_params)
     analyzer_load_state.empty()
     st_analyze_results = analyze(
         *analyzer_params,
         text=st_text,
         entities=st_entities,
-        language="en",
         score_threshold=st_threshold,
-        return_decision_process=st_return_decision_process,
-        allow_list=st_allow_list,
-        deny_list=st_deny_list,
-    )
-    # After
-    if st_operator not in ("highlight", "synthesize"):
-        with col2:
-            st.subheader(f"Output")
-            st_anonymize_results = anonymize(
-                text=st_text,
-                operator=st_operator,
-                mask_char=st_mask_char,
-                number_of_chars=st_number_of_chars,
-                encrypt_key=st_encrypt_key,
-                analyze_results=st_analyze_results,
-            )
-            st.text_area(
-                label="De-identified", value=st_anonymize_results.text, height=400
-            )
-    elif st_operator == "synthesize":
-        with col2:
-            st.subheader(f"OpenAI Generated output")
-            fake_data = create_fake_data(
-                st_text,
-                st_analyze_results,
-                open_ai_params,
-            )
-            st.text_area(label="Synthetic data", value=fake_data, height=400)
     else:
-        st.subheader("Highlighted")
         annotated_tokens = annotate(text=st_text, analyze_results=st_analyze_results)
-        # annotated_tokens
         annotated_text(*annotated_tokens)
-    # table result
-    st.subheader(
-        "Findings"
-        if not st_return_decision_process
-        else "Findings with decision factors"
-    )
     if st_analyze_results:
         df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
         df["text"] = [st_text[res.start : res.end] for res in st_analyze_results]
         df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
             {
-                "entity_type": "Entity type",
                 "text": "Text",
-                "start": "Start",
-                "end": "End",
-                "score": "Confidence",
             },
             axis=1,
         )
-        df_subset["Text"] = [st_text[res.start : res.end] for res in st_analyze_results]
-        if st_return_decision_process:
-            analysis_explanation_df = pd.DataFrame.from_records(
-                [r.analysis_explanation.to_dict() for r in st_analyze_results]
-            )
-            df_subset = pd.concat([df_subset, analysis_explanation_df], axis=1)
         st.dataframe(df_subset.reset_index(drop=True), use_container_width=True)
     else:
-        st.text("No findings")
 except Exception as e:
     print(e)
     traceback.print_exc()
-    st.error(e)
-components.html(
-    """
-    <script type="text/javascript">
-    (function(c,l,a,r,i,t,y){
-        c[a]=c[a]||function(){(c[a].q=c[a].q||[]).push(arguments)};
-        t=l.createElement(r);t.async=1;t.src="https://www.clarity.ms/tag/"+i;
-        y=l.getElementsByTagName(r)[0];y.parentNode.insertBefore(t,y);
-    })(window, document, "clarity", "script", "h7f8bp42n8");
-    </script>
-    """
-)

+"""Streamlit app pro anonymizaci českých textů s využitím Presidio."""
 import logging
 import os
 import traceback
 import pandas as pd
 import streamlit as st
 from annotated_text import annotated_text
 from streamlit_tags import st_tags
 )
 st.set_page_config(
+    page_title="Anonymizace českých textů",
     layout="wide",
     initial_sidebar_state="expanded",
     menu_items={
     },
 )
 logger = logging.getLogger("presidio-streamlit")
 # Sidebar
+st.sidebar.header("Anonymizace osobních údajů v českých textech")
 model_help_text = """
+    Vyberte model pro rozpoznávání pojmenovaných entit (NER) pro detekci osobních údajů.
     """
 model_list = [
     "iiiorg/piiranha-v1-detect-personal-information",
+    "spaCy/cs_core_news_sm",
 ]
 st_model = st.sidebar.selectbox(
+    "NER model",
     model_list,
+    index=0,
     help=model_help_text,
 )
 st_model_package = st_model.split("/")[0]
+st_model = "/".join(st_model.split("/")[1:])
+analyzer_params = (st_model_package, st_model, None, None)
 logger.debug(f"analyzer_params: {analyzer_params}")
 st_operator = st.sidebar.selectbox(
+    "Metoda anonymizace",
+    ["nahrazení", "maskování", "zvýraznění"],
+    index=0,
     help="""
+    Vyberte způsob anonymizace textu po identifikaci osobních údajů.\n
+    - Nahrazení: Nahradí osobní údaj obecným označením, např. <OSOBA>\n
+    - Maskování: Nahradí část osobního údaje hvězdičkami\n
+    - Zvýraznění: Zvýrazní osobní údaje v původním textu
+    """,
 )
+st_mask_char = "*"
+st_number_of_chars = 4
+if st_operator == "maskování":
     st_number_of_chars = st.sidebar.number_input(
+        "Počet znaků k maskování", value=st_number_of_chars, min_value=0, max_value=100
     )
     st_mask_char = st.sidebar.text_input(
+        "Znak pro maskování", value=st_mask_char, max_chars=1
     )
 st_threshold = st.sidebar.slider(
+    label="Práh přijetí",
     min_value=0.0,
     max_value=1.0,
     value=0.35,
+    help="Definujte práh pro přijetí detekce jako osobní údaj.",
 )
+# Hlavní panel
+st.title("Anonymizace českých textů")
+# Načtení ukázkového textu
+with open("demo_text.txt", "r", encoding="utf-8") as f:
+    demo_text = f.read()
+# Vytvoření dvou sloupců pro vstup a výstup
 col1, col2 = st.columns(2)
+# Vstup
+col1.subheader("Vstupní text")
 st_text = col1.text_area(
+    label="Zadejte text", value=demo_text, height=400, key="text_input"
 )
 try:
+    # Výběr entit
+    st_entities_expander = st.sidebar.expander("Vyberte entity k detekci")
     st_entities = st_entities_expander.multiselect(
+        label="Které entity hledat?",
         options=get_supported_entities(*analyzer_params),
         default=list(get_supported_entities(*analyzer_params)),
+        help="Omezte seznam detekovaných osobních údajů.",
     )
+    # Inicializace analyzátoru
+    analyzer_load_state = st.info("Spouštění Presidio analyzátoru...")
     analyzer = analyzer_engine(*analyzer_params)
     analyzer_load_state.empty()
+    # Analýza textu
     st_analyze_results = analyze(
         *analyzer_params,
         text=st_text,
         entities=st_entities,
+        language="cs",
         score_threshold=st_threshold,
+        return_decision_process=False,
+        allow_list=[],
+        deny_list=[],
+    )
+    # Výstup
+    col2.subheader("Výstup")
+    if st_operator != "zvýraznění":
+        st_anonymize_results = anonymize(
+            text=st_text,
+            operator=st_operator,
+            mask_char=st_mask_char,
+            number_of_chars=st_number_of_chars,
+            analyze_results=st_analyze_results,
+        )
+        col2.text_area(
+            label="Anonymizovaný text", value=st_anonymize_results.text, height=400
+        )
     else:
         annotated_tokens = annotate(text=st_text, analyze_results=st_analyze_results)
         annotated_text(*annotated_tokens)
+    # Tabulka s výsledky
+    st.subheader("Nalezené osobní údaje")
     if st_analyze_results:
         df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
         df["text"] = [st_text[res.start : res.end] for res in st_analyze_results]
         df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
             {
+                "entity_type": "Typ entity",
                 "text": "Text",
+                "start": "Začátek",
+                "end": "Konec",
+                "score": "Důvěryhodnost",
             },
             axis=1,
         )
         st.dataframe(df_subset.reset_index(drop=True), use_container_width=True)
     else:
+        st.text("Žádné osobní údaje nebyly nalezeny.")
 except Exception as e:
     print(e)
     traceback.print_exc()
+    st.error(f"Došlo k chybě: {str(e)}")
+# Informace o aplikaci
+st.sidebar.markdown("---")
+st.sidebar.subheader("O aplikaci")
+st.sidebar.info(
+    "Tato aplikace anonymizuje osobní údaje v českých textech. "
+    "Využívá Microsoft Presidio a pokročilé NLP techniky pro detekci a anonymizaci PII."
+)