{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"gpuType":"T4","authorship_tag":"ABX9TyM1ED23mmqdh3VQ+6zffiai"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"},"accelerator":"GPU"},"cells":[{"cell_type":"markdown","source":["**Create Custom PII Entity Recognizers**\n","\n","Presidio Analyzer comes with a pre-defined set of entity recognizers. It also allows adding new recognizers without changing the analyzer base code, by creating custom recognizers."],"metadata":{"id":"jCA19-6459WT"}},{"cell_type":"code","source":["# Install Presidio, pinned to the versions this notebook was last run with (see recorded output),\n","# and download the spaCy English model. %pip (unlike !pip) installs into the running kernel's environment.\n","%pip install presidio_analyzer==2.2.360 presidio_anonymizer==2.2.360\n","!python -m spacy download en_core_web_lg\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"QnyloYQJ-b5q","executionInfo":{"status":"ok","timestamp":1766271361539,"user_tz":300,"elapsed":19421,"user":{"displayName":"FirstLink Consulting","userId":"01838976358137567773"}},"outputId":"1733d6f0-27f7-4f4b-df0d-64baa6589ae7"},"execution_count":14,"outputs":[{"output_type":"stream","name":"stdout","text":["Requirement already satisfied: presidio_analyzer in /usr/local/lib/python3.12/dist-packages (2.2.360)\n","Requirement already satisfied: presidio_anonymizer in /usr/local/lib/python3.12/dist-packages (2.2.360)\n","Requirement already satisfied: phonenumbers<10.0.0,>=8.12 in /usr/local/lib/python3.12/dist-packages (from presidio_analyzer) (9.0.21)\n","Requirement already satisfied: pyyaml in /usr/local/lib/python3.12/dist-packages (from presidio_analyzer) (6.0.3)\n","Requirement already satisfied: regex in /usr/local/lib/python3.12/dist-packages (from presidio_analyzer) (2025.11.3)\n","Requirement already satisfied: spacy!=3.7.0,>=3.4.4 in /usr/local/lib/python3.12/dist-packages (from presidio_analyzer) (3.8.11)\n","Requirement already satisfied: tldextract in /usr/local/lib/python3.12/dist-packages (from presidio_analyzer) (5.3.0)\n","Requirement already satisfied: cryptography<44.1 in 
/usr/local/lib/python3.12/dist-packages (from presidio_anonymizer) (43.0.3)\n","Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.12/dist-packages (from cryptography<44.1->presidio_anonymizer) (2.0.0)\n","Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /usr/local/lib/python3.12/dist-packages (from spacy!=3.7.0,>=3.4.4->presidio_analyzer) (3.0.12)\n","Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /usr/local/lib/python3.12/dist-packages (from spacy!=3.7.0,>=3.4.4->presidio_analyzer) (1.0.5)\n","Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.12/dist-packages (from spacy!=3.7.0,>=3.4.4->presidio_analyzer) (1.0.15)\n","Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.12/dist-packages (from spacy!=3.7.0,>=3.4.4->presidio_analyzer) (2.0.13)\n","Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.12/dist-packages (from spacy!=3.7.0,>=3.4.4->presidio_analyzer) (3.0.12)\n","Requirement already satisfied: thinc<8.4.0,>=8.3.4 in /usr/local/lib/python3.12/dist-packages (from spacy!=3.7.0,>=3.4.4->presidio_analyzer) (8.3.10)\n","Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /usr/local/lib/python3.12/dist-packages (from spacy!=3.7.0,>=3.4.4->presidio_analyzer) (1.1.3)\n","Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /usr/local/lib/python3.12/dist-packages (from spacy!=3.7.0,>=3.4.4->presidio_analyzer) (2.5.2)\n","Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /usr/local/lib/python3.12/dist-packages (from spacy!=3.7.0,>=3.4.4->presidio_analyzer) (2.0.10)\n","Requirement already satisfied: weasel<0.5.0,>=0.4.2 in /usr/local/lib/python3.12/dist-packages (from spacy!=3.7.0,>=3.4.4->presidio_analyzer) (0.4.3)\n","Requirement already satisfied: typer-slim<1.0.0,>=0.3.0 in /usr/local/lib/python3.12/dist-packages (from spacy!=3.7.0,>=3.4.4->presidio_analyzer) (0.20.0)\n","Requirement already satisfied: tqdm<5.0.0,>=4.38.0 
in /usr/local/lib/python3.12/dist-packages (from spacy!=3.7.0,>=3.4.4->presidio_analyzer) (4.67.1)\n","Requirement already satisfied: numpy>=1.19.0 in /usr/local/lib/python3.12/dist-packages (from spacy!=3.7.0,>=3.4.4->presidio_analyzer) (2.0.2)\n","Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.12/dist-packages (from spacy!=3.7.0,>=3.4.4->presidio_analyzer) (2.32.4)\n","Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /usr/local/lib/python3.12/dist-packages (from spacy!=3.7.0,>=3.4.4->presidio_analyzer) (2.12.3)\n","Requirement already satisfied: jinja2 in /usr/local/lib/python3.12/dist-packages (from spacy!=3.7.0,>=3.4.4->presidio_analyzer) (3.1.6)\n","Requirement already satisfied: setuptools in /usr/local/lib/python3.12/dist-packages (from spacy!=3.7.0,>=3.4.4->presidio_analyzer) (75.2.0)\n","Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.12/dist-packages (from spacy!=3.7.0,>=3.4.4->presidio_analyzer) (25.0)\n","Requirement already satisfied: idna in /usr/local/lib/python3.12/dist-packages (from tldextract->presidio_analyzer) (3.11)\n","Requirement already satisfied: requests-file>=1.4 in /usr/local/lib/python3.12/dist-packages (from tldextract->presidio_analyzer) (3.0.1)\n","Requirement already satisfied: filelock>=3.0.8 in /usr/local/lib/python3.12/dist-packages (from tldextract->presidio_analyzer) (3.20.0)\n","Requirement already satisfied: pycparser in /usr/local/lib/python3.12/dist-packages (from cffi>=1.12->cryptography<44.1->presidio_anonymizer) (2.23)\n","Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.12/dist-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy!=3.7.0,>=3.4.4->presidio_analyzer) (0.7.0)\n","Requirement already satisfied: pydantic-core==2.41.4 in /usr/local/lib/python3.12/dist-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy!=3.7.0,>=3.4.4->presidio_analyzer) (2.41.4)\n","Requirement already satisfied: 
typing-extensions>=4.14.1 in /usr/local/lib/python3.12/dist-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy!=3.7.0,>=3.4.4->presidio_analyzer) (4.15.0)\n","Requirement already satisfied: typing-inspection>=0.4.2 in /usr/local/lib/python3.12/dist-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy!=3.7.0,>=3.4.4->presidio_analyzer) (0.4.2)\n","Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests<3.0.0,>=2.13.0->spacy!=3.7.0,>=3.4.4->presidio_analyzer) (3.4.4)\n","Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests<3.0.0,>=2.13.0->spacy!=3.7.0,>=3.4.4->presidio_analyzer) (2.5.0)\n","Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.12/dist-packages (from requests<3.0.0,>=2.13.0->spacy!=3.7.0,>=3.4.4->presidio_analyzer) (2025.11.12)\n","Requirement already satisfied: blis<1.4.0,>=1.3.0 in /usr/local/lib/python3.12/dist-packages (from thinc<8.4.0,>=8.3.4->spacy!=3.7.0,>=3.4.4->presidio_analyzer) (1.3.3)\n","Requirement already satisfied: confection<1.0.0,>=0.0.1 in /usr/local/lib/python3.12/dist-packages (from thinc<8.4.0,>=8.3.4->spacy!=3.7.0,>=3.4.4->presidio_analyzer) (0.1.5)\n","Requirement already satisfied: click>=8.0.0 in /usr/local/lib/python3.12/dist-packages (from typer-slim<1.0.0,>=0.3.0->spacy!=3.7.0,>=3.4.4->presidio_analyzer) (8.3.1)\n","Requirement already satisfied: cloudpathlib<1.0.0,>=0.7.0 in /usr/local/lib/python3.12/dist-packages (from weasel<0.5.0,>=0.4.2->spacy!=3.7.0,>=3.4.4->presidio_analyzer) (0.23.0)\n","Requirement already satisfied: smart-open<8.0.0,>=5.2.1 in /usr/local/lib/python3.12/dist-packages (from weasel<0.5.0,>=0.4.2->spacy!=3.7.0,>=3.4.4->presidio_analyzer) (7.5.0)\n","Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.12/dist-packages (from jinja2->spacy!=3.7.0,>=3.4.4->presidio_analyzer) (3.0.3)\n","Requirement already satisfied: wrapt in 
/usr/local/lib/python3.12/dist-packages (from smart-open<8.0.0,>=5.2.1->weasel<0.5.0,>=0.4.2->spacy!=3.7.0,>=3.4.4->presidio_analyzer) (2.0.1)\n","Collecting en-core-web-lg==3.8.0\n","  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)\n","\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m400.7/400.7 MB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25h\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n","You can now load the package via spacy.load('en_core_web_lg')\n","\u001b[38;5;3m⚠ Restart to reload dependencies\u001b[0m\n","If you are in a Jupyter or Colab notebook, you may need to restart Python in\n","order to load all the package's dependencies. You can do this by selecting the\n","'Restart kernel' or 'Restart runtime' option.\n"]}]},{"cell_type":"code","source":["from presidio_analyzer import AnalyzerEngine, PatternRecognizer"],"metadata":{"id":"rxx_HFrC-j-k","executionInfo":{"status":"ok","timestamp":1766271392615,"user_tz":300,"elapsed":5,"user":{"displayName":"FirstLink Consulting","userId":"01838976358137567773"}}},"execution_count":15,"outputs":[]},
{"cell_type":"code","source":["# Sentence to scan; it contains the honorific \"Mr.\" that the custom recognizer should flag.\n","sample_text = \"His name is Mr. Jones and his phone number is 212-555-5555\"\n","\n","# Deny-list recognizer that reports the listed honorifics as TITLE entities.\n","honorifics = [\"Mr.\", \"Mrs.\"]\n","title_recognizer = PatternRecognizer(supported_entity=\"TITLE\", deny_list=honorifics)\n","\n","# Add the custom recognizer to the engine's registry; the analyzer code itself is not modified.\n","analyzer = AnalyzerEngine()\n","analyzer.registry.add_recognizer(title_recognizer)\n","\n","# Request only TITLE entities for the English sample text.\n","analyzer_results = analyzer.analyze(\n","    text=sample_text,\n","    entities=[\"TITLE\"],\n","    language=\"en\",\n",")\n","print(analyzer_results)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"WFuFdyP9-qIC","executionInfo":{"status":"ok","timestamp":1766271550879,"user_tz":300,"elapsed":1699,"user":{"displayName":"FirstLink Consulting","userId":"01838976358137567773"}},"outputId":"0399c276-e474-47ac-e94b-37b0f5b15ca8"},"execution_count":16,"outputs":[{"output_type":"stream","name":"stderr","text":["WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - CreditCardRecognizer supported languages: es, registry supported languages: en\n","WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - CreditCardRecognizer supported languages: it, registry supported languages: en\n","WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - CreditCardRecognizer supported languages: pl, registry supported languages: en\n","WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - EsNifRecognizer supported languages: es, registry supported languages: en\n","WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - EsNieRecognizer supported languages: es, registry supported languages: en\n","WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - ItDriverLicenseRecognizer supported languages: it, registry supported languages: en\n",
"WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - ItFiscalCodeRecognizer supported languages: it, registry supported languages: en\n","WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - ItVatCodeRecognizer supported languages: it, registry supported languages: en\n","WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - ItIdentityCardRecognizer supported languages: it, registry supported languages: en\n","WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - ItPassportRecognizer supported languages: it, registry supported languages: en\n","WARNING:presidio-analyzer:Recognizer not added to registry because language is not supported by registry - PlPeselRecognizer supported languages: pl, registry supported languages: en\n"]},{"output_type":"stream","name":"stdout","text":["[type: TITLE, start: 12, end: 15, score: 1.0]\n"]}]}]}