Skip to content

Commit d0e2360

Browse files
committed
Add anomaly-detection example
1 parent 049ec55 commit d0e2360

File tree

5 files changed

+1095
-0
lines changed

5 files changed

+1095
-0
lines changed

anomaly-detection/lib/__init__.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
"""
2+
Process Anomaly Detection - Constants and Utilities
3+
"""
4+
5+
import logging
6+
7+
logger = logging.getLogger(__name__)
8+
MAX_SYSCALLS = 548
9+
10+
11+
def comm_for_pid(pid: int) -> bytes | None:
12+
"""Get process name from /proc."""
13+
try:
14+
with open(f"/proc/{pid}/comm", "rb") as f:
15+
return f.read().strip()
16+
except FileNotFoundError:
17+
logger.warning(f"Process with PID {pid} not found.")
18+
except PermissionError:
19+
logger.warning(f"Permission denied when accessing /proc/{pid}/comm.")
20+
except Exception as e:
21+
logger.warning(f"Error reading /proc/{pid}/comm: {e}")
22+
return None

anomaly-detection/lib/ml.py

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
"""
2+
Autoencoder for Process Behavior Anomaly Detection
3+
4+
Uses Keras/TensorFlow to train an autoencoder on syscall patterns.
5+
Anomalies are detected when reconstruction error exceeds threshold.
6+
"""
7+
8+
import logging
9+
import os
10+
11+
import numpy as np
12+
import pandas as pd
13+
from sklearn.model_selection import train_test_split
14+
from tensorflow import keras
15+
16+
from lib import MAX_SYSCALLS
17+
18+
logger = logging.getLogger(__name__)
19+
20+
21+
def create_autoencoder(n_inputs: int = MAX_SYSCALLS) -> keras.Model:
    """
    Build and compile the autoencoder network.

    Layout: input -> Dense+ReLU encoder -> Dense bottleneck (half width)
    -> Dense+ReLU decoder -> linear Dense output. Compiled with the Adam
    optimizer and mean-squared-error loss.
    """
    inputs = keras.Input(shape=(n_inputs,))

    # Encoder: full-width dense layer followed by ReLU.
    hidden = keras.layers.Dense(n_inputs)(inputs)
    hidden = keras.layers.ReLU()(hidden)

    # Bottleneck compresses the representation to half the input width.
    compressed = keras.layers.Dense(n_inputs // 2)(hidden)

    # Decoder mirrors the encoder; a final linear layer reconstructs the input.
    expanded = keras.layers.Dense(n_inputs)(compressed)
    expanded = keras.layers.ReLU()(expanded)
    reconstruction = keras.layers.Dense(n_inputs, activation="linear")(expanded)

    autoencoder = keras.Model(inputs, reconstruction)
    autoencoder.compile(optimizer="adam", loss="mse")

    return autoencoder
45+
46+
47+
class AutoEncoder:
    """
    Autoencoder for syscall pattern anomaly detection.

    Wraps model creation, training, persistence, and inference. Callers
    flag a sample as anomalous when its reconstruction error exceeds the
    threshold computed at training time.

    Usage:
        # Training (batch_size is required — no default)
        ae = AutoEncoder('model.keras')
        model, threshold = ae.train('data.csv', epochs=200, batch_size=32)

        # Inference
        ae = AutoEncoder('model.keras', load=True)
        _, errors, total_error = ae.predict([features])
    """

    def __init__(self, filename: str, load: bool = False):
        """
        Args:
            filename: Path of the .keras model file to load from / save to.
            load: If True, immediately load a previously trained model.

        Raises:
            FileNotFoundError: If load is True and filename does not exist.
        """
        self.filename = filename
        self.model = None

        if load:
            self._load_model()

    def _load_model(self) -> None:
        """Load a trained model from disk into self.model.

        Raises:
            FileNotFoundError: If the model file does not exist.
        """
        if not os.path.exists(self.filename):
            raise FileNotFoundError(f"Model file not found: {self.filename}")

        # Lazy %-style logging args: no formatting cost when level disabled.
        logger.info("Loading model from %s", self.filename)
        self.model = keras.models.load_model(self.filename)

    def train(
        self,
        datafile: str,
        epochs: int,
        batch_size: int,
        test_size: float = 0.1,
    ) -> tuple[keras.Model, float]:
        """
        Train the autoencoder on collected data and save it to disk.

        Args:
            datafile: Path to CSV file with training data. Must contain a
                'sample_time' column, which is dropped before training.
            epochs: Number of training epochs.
            batch_size: Training batch size.
            test_size: Fraction of data held out for validation and
                threshold computation.

        Returns:
            Tuple of (trained model, error threshold). The threshold is
            the maximum reconstruction error observed on the held-out set.

        Raises:
            FileNotFoundError: If datafile does not exist.
            RuntimeError: If model creation fails.
        """
        if not os.path.exists(datafile):
            raise FileNotFoundError(f"Data file not found: {datafile}")

        logger.info("Loading training data from %s", datafile)

        # Load and prepare data; the timestamp column is metadata, not a feature.
        df = pd.read_csv(datafile)
        features = df.drop(["sample_time"], axis=1).values

        logger.info(
            "Loaded %s samples with %s features", len(features), features.shape[1]
        )

        # Split train/test. Fixed random_state makes the split reproducible.
        train_data, test_data = train_test_split(
            features,
            test_size=test_size,
            random_state=42,
        )

        logger.info("Training set: %s samples", len(train_data))
        logger.info("Test set: %s samples", len(test_data))

        # Create and train model
        self.model = create_autoencoder()

        if self.model is None:
            raise RuntimeError("Failed to create the autoencoder model.")

        logger.info("Training autoencoder...")
        # Autoencoder: inputs serve as their own targets.
        self.model.fit(
            train_data,
            train_data,
            validation_data=(test_data, test_data),
            epochs=epochs,
            batch_size=batch_size,
            verbose=1,
        )

        # Save model (use .keras format for Keras 3.x compatibility)
        self.model.save(self.filename)
        logger.info("Model saved to %s", self.filename)

        # Calculate error threshold from test data
        threshold = self._calculate_threshold(test_data)

        return self.model, threshold

    def _calculate_threshold(self, test_data: np.ndarray) -> float:
        """Return the maximum per-sample reconstruction error on test_data.

        Raises:
            RuntimeError: If no model is loaded/trained yet.
        """
        logger.info(
            "Calculating error threshold from %s test samples", len(test_data)
        )

        if self.model is None:
            raise RuntimeError("Model not loaded. Use load=True or train first.")

        predictions = self.model.predict(test_data, verbose=0)
        # L1 reconstruction error summed over features, per sample.
        errors = np.abs(test_data - predictions).sum(axis=1)

        return float(errors.max())

    def predict(self, X: list | np.ndarray) -> tuple[np.ndarray, np.ndarray, float]:
        """
        Run prediction and return reconstruction error.

        Args:
            X: Input data (list of feature vectors). NOTE: per-feature
                errors and total_error are computed for the FIRST sample
                only, matching the single-sample usage `predict([features])`.

        Returns:
            Tuple of (reconstructed, per_feature_errors, total_error).

        Raises:
            RuntimeError: If no model is loaded/trained yet.
        """
        if self.model is None:
            raise RuntimeError("Model not loaded. Use load=True or train first.")

        X = np.asarray(X, dtype=np.float32)
        y = self.model.predict(X, verbose=0)

        # Per-feature reconstruction error (first sample only, see docstring).
        errors = np.abs(X[0] - y[0])
        total_error = float(errors.sum())

        return y, errors, total_error

0 commit comments

Comments
 (0)