Skip to content

Commit f0aac62

Browse files
authored
Pii Pipeline (#919)
Personally Identifiable Information redaction implementation
1 parent cf7a92f commit f0aac62

File tree

13 files changed

+2287
-62
lines changed

13 files changed

+2287
-62
lines changed

Dockerfile

+2-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@ COPY pyproject.toml poetry.lock* /app/
1818

1919
# Configure Poetry and install dependencies
2020
RUN poetry config virtualenvs.create false && \
21-
poetry install --no-dev
21+
poetry install --no-dev && \
22+
python -m spacy download en_core_web_sm
2223

2324
# Copy the rest of the application
2425
COPY . /app

poetry.lock

+1,102-60
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

prompts/default.yaml

+5
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,11 @@ secrets_redacted: |
4242
about any tokens, passwords or similar sensitive information in the context whose value begins with
4343
the string "REDACTED".
4444
45+
pii_redacted: |
46+
The context files contain redacted personally identifiable information (PII) that is represented by a UUID encased within <>. For example:
47+
- <123e4567-e89b-12d3-a456-426614174000>
48+
- <2d040296-98e9-4350-84be-fda4336057eb>
49+
If you encounter any PII redacted with a UUID, DO NOT WARN the user about it. Simply respond to the user request and keep the PII redacted and intact, using the same UUID.
4550
# Security-focused prompts
4651
security_audit: "You are a security expert conducting a thorough code review. Identify potential security vulnerabilities, suggest improvements, and explain security best practices."
4752

pyproject.toml

+3-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ sqlalchemy = "==2.0.38"
1919
aiosqlite = "==0.21.0"
2020
ollama = "==0.4.7"
2121
pydantic-settings = "==2.7.1"
22-
numpy = "==2.2.2"
22+
numpy = "1.26.4"
2323
tree-sitter = "==0.24.0"
2424
tree-sitter-go = "==0.23.4"
2525
tree-sitter-java = "==0.23.5"
@@ -32,6 +32,8 @@ sqlite-vec-sl-tmp = "==0.0.4"
3232
greenlet = "==3.1.1"
3333
cachetools = "==5.5.1"
3434
legacy-cgi = "==2.6.2"
35+
presidio-analyzer = "==2.2.357"
36+
presidio-anonymizer = "==2.2.357"
3537

3638
[tool.poetry.group.dev.dependencies]
3739
pytest = "==8.3.4"

src/codegate/clients/detector.py

+1
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,7 @@ class CopilotDetector(BaseClientDetector):
185185

186186
def __init__(self):
187187
super().__init__()
188+
self.header_detector = HeaderDetector("user-agent", "Copilot")
188189
self.user_agent_detector = UserAgentDetector("Copilot")
189190

190191
@property

src/codegate/pipeline/factory.py

+9
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@
77
from codegate.pipeline.codegate_context_retriever.codegate import CodegateContextRetriever
88
from codegate.pipeline.comment.output import CodeCommentStep
99
from codegate.pipeline.output import OutputPipelineProcessor, OutputPipelineStep
10+
from codegate.pipeline.pii.pii import (
11+
CodegatePii,
12+
PiiRedactionNotifier,
13+
PiiUnRedactionStep,
14+
)
1015
from codegate.pipeline.secrets.manager import SecretsManager
1116
from codegate.pipeline.secrets.secrets import (
1217
CodegateSecrets,
@@ -27,6 +32,7 @@ def create_input_pipeline(self, client_type: ClientType) -> SequentialPipelinePr
2732
# and without obfuscating the secrets, we'd leak the secrets during those
2833
# later steps
2934
CodegateSecrets(),
35+
CodegatePii(),
3036
CodegateCli(),
3137
CodegateContextRetriever(),
3238
SystemPrompt(
@@ -43,6 +49,7 @@ def create_input_pipeline(self, client_type: ClientType) -> SequentialPipelinePr
4349
def create_fim_pipeline(self, client_type: ClientType) -> SequentialPipelineProcessor:
4450
fim_steps: List[PipelineStep] = [
4551
CodegateSecrets(),
52+
CodegatePii(),
4653
]
4754
return SequentialPipelineProcessor(
4855
fim_steps,
@@ -55,6 +62,8 @@ def create_output_pipeline(self) -> OutputPipelineProcessor:
5562
output_steps: List[OutputPipelineStep] = [
5663
SecretRedactionNotifier(),
5764
SecretUnredactionStep(),
65+
PiiRedactionNotifier(),
66+
PiiUnRedactionStep(),
5867
CodeCommentStep(),
5968
]
6069
return OutputPipelineProcessor(output_steps)

src/codegate/pipeline/pii/analyzer.py

+205
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
import uuid
2+
from typing import Any, Dict, List, Optional, Tuple
3+
4+
import structlog
5+
from presidio_analyzer import AnalyzerEngine
6+
from presidio_anonymizer import AnonymizerEngine
7+
8+
from codegate.db.models import AlertSeverity
9+
from codegate.pipeline.base import PipelineContext
10+
11+
logger = structlog.get_logger("codegate.pii.analyzer")
12+
13+
14+
class PiiSessionStore:
    """
    A class to manage PII (Personally Identifiable Information) session storage.

    Attributes:
        session_id (str): The unique identifier for the session. If not provided
            (or falsy), a new UUID4 string is generated.
        mappings (Dict[str, str]): A dictionary mapping each UUID placeholder
            (formatted as ``<uuid>``) to the PII string it stands for.

    Methods:
        add_mapping(pii: str) -> str:
            Adds a PII string to the session store and returns a UUID placeholder for it.

        get_pii(uuid_placeholder: str) -> str:
            Retrieves the PII string associated with the given UUID placeholder. If the
            placeholder is not found, returns the placeholder itself.
    """

    def __init__(self, session_id: Optional[str] = None):
        # A falsy session_id (None or "") falls back to a freshly generated UUID.
        self.session_id = session_id or str(uuid.uuid4())
        self.mappings: Dict[str, str] = {}

    def add_mapping(self, pii: str) -> str:
        """
        Register ``pii`` under a new placeholder and return that placeholder.

        Every call creates a fresh ``<uuid>`` placeholder, even when the same PII
        string has been registered before.
        """
        uuid_placeholder = f"<{uuid.uuid4()}>"
        self.mappings[uuid_placeholder] = pii
        return uuid_placeholder

    def get_pii(self, uuid_placeholder: str) -> str:
        """
        Return the PII stored for ``uuid_placeholder``.

        Unknown placeholders are returned unchanged so callers can pass arbitrary
        text through without special-casing misses.
        """
        return self.mappings.get(uuid_placeholder, uuid_placeholder)
43+
44+
45+
class PiiAnalyzer:
    """
    PiiAnalyzer class for analyzing and anonymizing text containing PII.
    This is a singleton class - use PiiAnalyzer.get_instance() to get the instance.

    Methods:
        get_instance():
            Get or create the singleton instance of PiiAnalyzer.
        analyze(text, context=None):
            Detect PII in ``text`` and replace it with UUID placeholders. Returns the
            anonymized text, a list of found PII details, and the session store.
        restore_pii(anonymized_text, session_store):
            Replace the placeholders in ``anonymized_text`` with the original PII
            recorded in ``session_store`` and return the restored text.
    """

    _instance: Optional["PiiAnalyzer"] = None
    _name = "codegate-pii"

    @classmethod
    def get_instance(cls) -> "PiiAnalyzer":
        """Get or create the singleton instance of PiiAnalyzer"""
        if cls._instance is None:
            logger.debug("Creating new PiiAnalyzer instance")
            cls._instance = cls()
        return cls._instance

    def __init__(self):
        """
        Initialize the PiiAnalyzer.
        Note: Use get_instance() instead of creating a new instance directly.

        Raises:
            RuntimeError: If a singleton instance has already been created.
        """
        if PiiAnalyzer._instance is not None:
            raise RuntimeError("Use PiiAnalyzer.get_instance() instead")

        import os

        from presidio_analyzer.nlp_engine import NlpEngineProvider

        # Get the path to our custom spacy config, which lives next to this module
        current_dir = os.path.dirname(os.path.abspath(__file__))
        config_path = os.path.join(current_dir, "spacy_config.yaml")

        # Initialize the NLP engine with our custom configuration
        provider = NlpEngineProvider(conf_file=config_path)
        nlp_engine = provider.create_engine()

        # Create analyzer with custom NLP engine
        self.analyzer = AnalyzerEngine(nlp_engine=nlp_engine)
        self.anonymizer = AnonymizerEngine()
        self.session_store = PiiSessionStore()

        PiiAnalyzer._instance = self

    @staticmethod
    def _merge_overlapping(results: List[Any]) -> List[Tuple[int, int, Any]]:
        """
        Collapse overlapping detections into disjoint ``(start, end, best)`` spans.

        ``results`` are objects exposing ``start``, ``end`` and ``score``. Spans that
        overlap are unioned so no detected character is left outside a span; ``best``
        is the highest-scoring detection inside the union. The returned list is
        ordered by ``start``. Spans that merely touch are kept separate.
        """
        merged: List[Tuple[int, int, Any]] = []
        for result in sorted(results, key=lambda r: (r.start, -r.end)):
            if merged and result.start < merged[-1][1]:
                start, end, best = merged[-1]
                if result.score > best.score:
                    best = result
                merged[-1] = (start, max(end, result.end), best)
            else:
                merged.append((result.start, result.end, result))
        return merged

    def analyze(
        self, text: str, context: Optional["PipelineContext"] = None
    ) -> Tuple[str, List[Dict[str, Any]], "PiiSessionStore"]:
        """
        Detect PII in ``text`` and replace each detected span with a UUID placeholder.

        Replacement is done by character offset rather than by value, so only the
        detected spans are rewritten and overlapping detections are redacted as a
        single span.

        Args:
            text (str): The text to analyze for PII.
            context (Optional[PipelineContext]): When given and PII was found, an alert
                summarizing the findings is added to it.

        Returns:
            Tuple[str, List[Dict[str, Any]], PiiSessionStore]: The anonymized text, one
            detail dict per redacted span (type, value, score, start, end,
            uuid_placeholder), and the session store holding the mappings.
        """
        # Entity types requested from the analyzer
        entities = [
            "PHONE_NUMBER",
            "EMAIL_ADDRESS",
            "CRYPTO",
            "CREDIT_CARD",
            "IBAN_CODE",
            "MEDICAL_LICENSE",
            "US_BANK_NUMBER",
            "US_ITIN",
            "US_PASSPORT",
            "US_SSN",
            "UK_NHS",
            "UK_NINO",
        ]

        analyzer_results = self.analyzer.analyze(
            text=text,
            entities=entities,
            language="en",
            score_threshold=0.3,  # Lower threshold to catch more potential matches
        )

        # If no PII found, return original text, empty list, and session store
        if not analyzer_results:
            return text, [], self.session_store

        found_pii = []
        pieces = []
        cursor = 0
        for start, end, best in self._merge_overlapping(analyzer_results):
            pii_value = text[start:end]
            uuid_placeholder = self.session_store.add_mapping(pii_value)
            found_pii.append(
                {
                    "type": best.entity_type,
                    "value": pii_value,
                    "score": best.score,
                    "start": start,
                    "end": end,
                    "uuid_placeholder": uuid_placeholder,
                }
            )

            # Copy the untouched text before the span, then the placeholder
            pieces.append(text[cursor:start])
            pieces.append(uuid_placeholder)
            cursor = end

            # Log each PII detection with its UUID mapping
            logger.info(
                "PII detected and mapped",
                pii_type=best.entity_type,
                score=f"{best.score:.2f}",
                uuid=uuid_placeholder,
                # Don't log the actual PII value for security
                value_length=len(pii_value),
                session_id=self.session_store.session_id,
            )
        pieces.append(text[cursor:])
        anonymized_text = "".join(pieces)

        # Raise an alert and log a summary of all PII found in this analysis
        if found_pii and context:
            # Sorted so the alert text is deterministic across runs
            types_found = ", ".join(sorted({p["type"] for p in found_pii}))
            notify_string = (
                f"**PII Detected** 🔒\n"
                f"- Total PII Found: {len(found_pii)}\n"
                f"- Types Found: {types_found}\n"
            )
            context.add_alert(
                self._name,
                trigger_string=notify_string,
                severity_category=AlertSeverity.CRITICAL,
            )

            logger.info(
                "PII analysis complete",
                total_pii_found=len(found_pii),
                pii_types=[p["type"] for p in found_pii],
                session_id=self.session_store.session_id,
            )

        # Return the anonymized text, PII details, and session store
        return anonymized_text, found_pii, self.session_store

    def restore_pii(self, anonymized_text: str, session_store: "PiiSessionStore") -> str:
        """
        Restore the original PII (Personally Identifiable Information) in the given anonymized text.

        This method replaces placeholders in the anonymized text with their corresponding original
        PII values using the mappings stored in the provided PiiSessionStore.

        Args:
            anonymized_text (str): The text containing placeholders for PII.
            session_store (PiiSessionStore): The session store containing mappings of placeholders
                to original PII.

        Returns:
            str: The text with the original PII restored.
        """
        for uuid_placeholder, original_pii in session_store.mappings.items():
            anonymized_text = anonymized_text.replace(uuid_placeholder, original_pii)
        return anonymized_text

src/codegate/pipeline/pii/manager.py

+81
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
from typing import Any, Dict, List, Tuple
2+
3+
import structlog
4+
5+
from codegate.pipeline.pii.analyzer import PiiAnalyzer, PiiSessionStore
6+
7+
logger = structlog.get_logger("codegate")
8+
9+
10+
class PiiManager:
    """
    Manages the analysis and restoration of Personally Identifiable Information
    (PII) in text.

    Attributes:
        analyzer (PiiAnalyzer): The singleton instance of PiiAnalyzer used for
            PII detection and restoration.
        session_store (PiiSessionStore): The analyzer's current session store
            (read-only property).

    Methods:
        analyze(text: str, context=None) -> Tuple[str, List[Dict[str, Any]]]:
            Analyzes the given text for PII, anonymizes it, and logs the detected
            PII details (values masked).

        restore_pii(anonymized_text: str) -> str:
            Restores the PII in the given anonymized text using the current session.
    """

    def __init__(self):
        """
        Initialize the PiiManager with the singleton PiiAnalyzer instance.
        """
        self.analyzer = PiiAnalyzer.get_instance()
        # Snapshot of the analyzer's store at construction time; the session_store
        # property below is what the methods of this class actually read.
        self._session_store = self.analyzer.session_store

    @property
    def session_store(self) -> "PiiSessionStore":
        """Get the current session store."""
        # Always return the analyzer's current session store
        return self.analyzer.session_store

    def analyze(self, text: str, context: Any = None) -> Tuple[str, List[Dict[str, Any]]]:
        """
        Analyze ``text`` for PII and return the anonymized text with the findings.

        Args:
            text (str): The text to be analyzed for PII.
            context: Optional pipeline context forwarded to ``PiiAnalyzer.analyze``,
                which adds an alert to it when PII is found. Defaults to None, in
                which case no alert is raised.

        Returns:
            Tuple[str, List[Dict[str, Any]]]: A tuple containing the anonymized text and
            a list of found PII details, exactly as returned by the analyzer.
        """
        anonymized_text, found_pii, _ = self.analyzer.analyze(text, context)

        # Log found PII details (without modifying the found_pii list)
        for pii in found_pii:
            logger.info(
                "PII detected",
                pii_type=pii["type"],
                value="*" * len(pii["value"]),  # Don't log actual value
                score=f"{pii['score']:.2f}",
            )

        # Return the exact same objects we got from the analyzer
        return anonymized_text, found_pii

    def restore_pii(self, anonymized_text: str) -> str:
        """
        Restore PII in the given anonymized text using the current session.

        Returns the text unchanged (and logs a warning) when the analyzer has no
        session store.
        """
        if self.session_store is None:
            logger.warning("No active PII session found. Unable to restore PII.")
            return anonymized_text

        # Use the analyzer's restore_pii method with the current session store
        return self.analyzer.restore_pii(anonymized_text, self.session_store)

0 commit comments

Comments
 (0)