Spaces:
Running
Running
petrsovadina
committed on
Commit
•
d7e23a0
1
Parent(s):
aa4eed8
Update presidio_nlp_engine_config.py
Browse files - presidio_nlp_engine_config.py +14 -33
presidio_nlp_engine_config.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
import logging
|
2 |
from typing import Tuple
|
|
|
3 |
import spacy
|
4 |
from presidio_analyzer import RecognizerRegistry
|
5 |
from presidio_analyzer.nlp_engine import (
|
@@ -8,6 +9,7 @@ from presidio_analyzer.nlp_engine import (
|
|
8 |
)
|
9 |
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
10 |
from presidio_analyzer.nlp_engine import TransformersNlpEngine
|
|
|
11 |
|
12 |
logger = logging.getLogger("presidio-streamlit")
|
13 |
|
@@ -21,7 +23,7 @@ def create_nlp_engine_with_spacy(
|
|
21 |
nlp = spacy.load(model_path)
|
22 |
nlp_configuration = {
|
23 |
"nlp_engine_name": "spacy",
|
24 |
-
"models": [{"lang_code": "
|
25 |
"ner_model_configuration": {
|
26 |
"model_to_presidio_entity_mapping": {
|
27 |
"PER": "PERSON",
|
@@ -56,37 +58,16 @@ def create_nlp_engine_with_transformers(
|
|
56 |
"""
|
57 |
print(f"Loading Transformers model: {model_path} of type {type(model_path)}")
|
58 |
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
"ORGANIZATION": "ORGANIZATION",
|
71 |
-
# Add more mappings as needed
|
72 |
-
}
|
73 |
-
|
74 |
-
registry = RecognizerRegistry()
|
75 |
-
registry.load_predefined_recognizers(nlp_engine=nlp_engine)
|
76 |
-
|
77 |
-
# You might want to add custom recognizers for this model
|
78 |
-
# For example:
|
79 |
-
# from presidio_analyzer import EntityRecognizer
|
80 |
-
# custom_recognizer = EntityRecognizer(supported_entities=["PERSON", "LOCATION", "ORGANIZATION"])
|
81 |
-
# registry.add_recognizer(custom_recognizer)
|
82 |
-
|
83 |
-
else:
|
84 |
-
# Default configuration for other transformer models
|
85 |
-
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
86 |
-
model = AutoModelForTokenClassification.from_pretrained(model_path)
|
87 |
-
|
88 |
-
nlp_engine = TransformersNlpEngine(tokenizer=tokenizer, model=model, device="cpu")
|
89 |
-
registry = RecognizerRegistry()
|
90 |
-
registry.load_predefined_recognizers(nlp_engine=nlp_engine)
|
91 |
|
92 |
return nlp_engine, registry
|
|
|
1 |
import logging
|
2 |
from typing import Tuple
|
3 |
+
import os
|
4 |
import spacy
|
5 |
from presidio_analyzer import RecognizerRegistry
|
6 |
from presidio_analyzer.nlp_engine import (
|
|
|
9 |
)
|
10 |
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
11 |
from presidio_analyzer.nlp_engine import TransformersNlpEngine
|
12 |
+
from huggingface_hub import login
|
13 |
|
14 |
logger = logging.getLogger("presidio-streamlit")
|
15 |
|
|
|
23 |
nlp = spacy.load(model_path)
|
24 |
nlp_configuration = {
|
25 |
"nlp_engine_name": "spacy",
|
26 |
+
"models": [{"lang_code": "cs", "model_name": model_path}],
|
27 |
"ner_model_configuration": {
|
28 |
"model_to_presidio_entity_mapping": {
|
29 |
"PER": "PERSON",
|
|
|
58 |
"""
|
59 |
print(f"Loading Transformers model: {model_path} of type {type(model_path)}")
|
60 |
|
61 |
+
hf_token = os.getenv("HUGGING_FACE_TOKEN")
|
62 |
+
if hf_token:
|
63 |
+
login(hf_token)
|
64 |
+
|
65 |
+
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
66 |
+
model = AutoModelForTokenClassification.from_pretrained(model_path)
|
67 |
+
|
68 |
+
nlp_engine = TransformersNlpEngine(tokenizer=tokenizer, model=model, device="cpu")
|
69 |
+
|
70 |
+
registry = RecognizerRegistry()
|
71 |
+
registry.load_predefined_recognizers(nlp_engine=nlp_engine)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
|
73 |
return nlp_engine, registry
|