# Foundational code for TPOT

This notebook explores the foundations of TPOT, a Genetic Programming (GP) library that automates the selection of the best machine learning model and hyperparameters for a given dataset. It demonstrates the following foundational concepts:

* Loading data from Elasticsearch
* Preparing nested data for the data pipeline
* Filtering out irrelevant information from traces (problem model)
* Vectorizing the (semi-natural, English-language) text data using the uncased BERT model
* Training and selecting a model with TPOT
* Evaluating the model and exporting the pipeline
* Visualizing the frequency of models tested by TPOT (GP internals)
* Loading the trained model and making predictions (todo)

In [None]:
import requests
import pandas as pd
import json

def _column_contains(series, kinds):
    """Return True if any value of *series* is an instance of *kinds*."""
    return bool(series.map(lambda value: isinstance(value, kinds)).any())


# Function to recursively normalize nested columns in a DataFrame
def recursively_normalize(data):
    """Flatten nested records into a DataFrame without dict or list cells.

    Args:
        data: A dict or a list of dicts, as accepted by ``pd.json_normalize``.

    Returns:
        A DataFrame in which list cells have been exploded into one row per
        element and dict cells have been expanded into one column per key
        (named after the key, as ``pd.json_normalize`` names them).

    Notes:
        * Every row is inspected, not only the first one, so a column whose
          first value happens to be a scalar is still normalized.
        * The index is reset after each ``explode``. Without this, exploded
          rows share an index label and joining the expanded dict columns
          pairs them with the wrong nested record.
        * In a column that holds dicts, non-dict values (for example the NaN
          produced by exploding an empty list) become rows of missing values.
    """
    df = pd.json_normalize(data)
    while True:
        nested_cols = [
            col for col in df.columns
            if _column_contains(df[col], (dict, list))
        ]
        if not nested_cols:
            break
        for col in nested_cols:
            if _column_contains(df[col], list):
                # One row per list element; a fresh unique index keeps the
                # positional alignment used by the join below.
                df = df.explode(col).reset_index(drop=True)
            if _column_contains(df[col], dict):
                records = [
                    value if isinstance(value, dict) else {}
                    for value in df[col]
                ]
                normalized = pd.json_normalize(records).reindex(range(len(records)))
                normalized.index = df.index
                df = df.drop(columns=[col]).join(normalized)
    return df

# Function to fetch the next batch using the cursor from the Elastic API
def fetch_next_batch(cursor, timeout=30):
    """Fetch the next page of an Elasticsearch SQL result set.

    Args:
        cursor: The ``cursor`` value returned by the previous SQL response.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The decoded JSON body of the response as a dict.

    Raises:
        requests.HTTPError: If the server answers with an error status.
            Raising here prevents an error body (which has no ``rows``) from
            being mistaken for the end of the result set.
    """
    response = requests.post(
        f"{base_url}/_sql?format=json",
        headers={"Content-Type": "application/json"},
        json={"cursor": cursor},
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()

# Elasticsearch base URL
base_url = "http://192.168.20.106:9200"
# Index name
# NOTE: not referenced by the query below, which names its index pattern explicitly.
index = "winlogbeat-*"

from datetime import datetime, timedelta, timezone

# Calculate the current time and the time one hour ago (timezone-aware UTC;
# datetime.utcnow() is deprecated and returns a naive value)
current_time = datetime.now(timezone.utc)
one_hour_ago = current_time - timedelta(hours=1)

# Format times in ISO8601 format as expected by Elasticsearch
current_time_iso = current_time.strftime('%Y-%m-%dT%H:%M:%SZ')
one_hour_ago_iso = one_hour_ago.strftime('%Y-%m-%dT%H:%M:%SZ')

# SQL query with time filter
sql_query = f"""
SELECT "@timestamp", host.hostname, host.ip, log.level, winlog.event_id, winlog.task, message
FROM "winlogbeat-7.10.0-2024.06.23-*"
WHERE host.hostname = 'win10'
AND winlog.provider_name = 'Microsoft-Windows-Sysmon'
AND "@timestamp" >= '{one_hour_ago_iso}'
AND "@timestamp" <= '{current_time_iso}'
"""

# Output files; both are rewritten from scratch on every run
csv_path = "lab_logs_blindtest_activity.csv"
json_path = "lab_logs_blindtest_activity.json"


def _write_batch(batch_rows, first_write):
    """Normalize one batch of rows and write it to the CSV and JSON-lines files.

    The first write truncates both files and emits the CSV header; later
    writes append. Returns the normalized DataFrame.
    """
    batch_df = pd.DataFrame(batch_rows, columns=columns)
    batch_df = recursively_normalize(batch_df.to_dict(orient='records'))
    mode = 'w' if first_write else 'a'
    batch_df.to_csv(csv_path, mode=mode, index=False, header=first_write)
    json_lines = batch_df.to_json(orient='records', lines=True).splitlines()
    with open(json_path, mode) as file:
        file.writelines(line + '\n' for line in json_lines)
    return batch_df


# Initial SQL request; the response carries a cursor when more pages exist
initial_http_response = requests.post(
    f"{base_url}/_sql?format=json",
    headers={"Content-Type": "application/json"},
    json={
        "query": sql_query,
        "field_multi_value_leniency": True
    },
    timeout=30,
)
# Fail loudly on an error response instead of hitting a KeyError below
initial_http_response.raise_for_status()
initial_response = initial_http_response.json()

# Extract the cursor for paging; tolerate a response without rows
cursor = initial_response.get('cursor')
rows = initial_response.get('rows') or []
columns = [col['name'] for col in initial_response.get('columns', [])]

# Write the first batch to both output files (previously the JSON file
# silently missed this batch)
is_first_write = True
if rows:
    df = _write_batch(rows, first_write=True)
    is_first_write = False

# Track total documents retrieved
total_documents_retrieved = len(rows)
print(f"Retrieved {total_documents_retrieved} documents.")

# Loop to fetch subsequent batches of documents until no more documents are left
while cursor:
    # Fetch next batch of documents using cursor
    response = fetch_next_batch(cursor)

    # Update cursor for the next batch
    cursor = response.get('cursor')
    rows = response.get('rows')

    # If no rows, break out of the loop
    if not rows:
        break

    # Normalize data and append to the CSV and JSON files
    df = _write_batch(rows, first_write=is_first_write)
    is_first_write = False

    # Update total documents retrieved
    total_documents_retrieved += len(rows)

    print(f"Retrieved {total_documents_retrieved} documents.")

print("Files have been written.")

## Load data from a CSV file

Load the data from the CSV file into a DataFrame using Polars, a fast DataFrame library written in Rust. This step prepares the data for the filtering and processing that follow.


In [None]:
import polars as pl

# Define the path to your CSV file
csv_file_path = 'lab_logs_blindtest_activity.csv'

# Load the CSV file into a DataFrame
df = pl.read_csv(csv_file_path)

# Show the DataFrame to confirm it's loaded correctly
print(df)


## Data filtering and transformation

Filter out irrelevant information from the traces to focus on the key details. This step involves removing specific lines based on keywords present at the start of the line. The goal is to clean up the data and make it more manageable for further processing.

In [None]:
def remove_keyword_lines(batch, keywords):
    """Drop or shorten ``<keyword>: ...`` lines in every message of *batch*.

    Args:
        batch: A Polars Series of multi-line message strings.
        keywords: Field names to act on when a line starts with
            ``"<keyword>:"``.

    Returns:
        A Utf8 Series where, for each message:
          * ``User:`` lines keep only the segment after the first backslash
            (``User: DOMAIN\\name`` -> ``User: name``); a ``User:`` line
            without a backslash is removed;
          * ``SourceHostname:`` lines keep only the part of the value before
            the first dot;
          * lines starting with any other listed keyword are removed;
          * all remaining non-empty lines are kept unchanged.
        Empty lines are dropped as well because only truthy lines are joined.
    """
    def modify_line(line):
        # First keyword (in the caller's order) that prefixes the line
        matched = next(
            (keyword for keyword in keywords if line.startswith(f"{keyword}:")),
            None,
        )
        if matched is None:
            return line  # Not a filtered field: keep as is

        if matched == 'User':
            parts = line.split('\\')
            # Only keep the part after the backslash; drop the line otherwise
            return f"User: {parts[1]}" if len(parts) > 1 else None

        if matched == 'SourceHostname':
            # Slice off the prefix rather than splitting on ': ', which raised
            # IndexError when the field had no value.
            value = line[len(matched) + 1:].strip()
            host = value.split('.', 1)[0]
            return f"{matched}: {host}" if host else line

        return None  # For other keywords, remove the line altogether

    def filter_message(message):
        kept = (modify_line(line) for line in message.split('\n'))
        return '\n'.join(filter(None, kept))

    # Use map_elements to apply the per-message filter to the batch
    return batch.map_elements(filter_message, return_dtype=pl.Utf8)


# Sysmon field names handled by remove_keyword_lines: 'User' and
# 'SourceHostname' lines are shortened, lines for the other names are removed
keywords_to_filter = ["UtcTime", "SourceProcessGUID","ProcessGuid", "TargetProcessGUID", "TargetObject", "FileVersion", "Hashes", "LogonGuid", "LogonId", "CreationUtcTime", "User", "ParentProcessGuid", "SourceHostname"]


# Apply the transformation to the 'message' column using map_batches and
# store the result in a new 'filtered_message' column
df_f = df.with_columns(
 pl.col("message").map_batches(lambda batch: remove_keyword_lines(batch, keywords_to_filter), return_dtype=pl.Utf8).alias("filtered_message")
)

# Fetch the first 200 rows from the 'filtered_message' column for a visual check
first_messages = df_f["filtered_message"].head(200)

# Print each message completely
for i, message in enumerate(first_messages):
 print(f"Message {i+1}:")
 print(message)
 print("-" * 50) # Separator for readability


## Select specific columns and write to a CSV file

This is a data reduction approach where only the necessary columns are selected for further processing. The selected columns are then written to a new CSV file for use in subsequent steps.

In [None]:
# Assuming df_f is your modified DataFrame with all necessary columns including 'filtered_message'
# Select specific columns from the DataFrame
selected_columns_df = df_f.select(["log.level", "winlog.event_id", "winlog.task","filtered_message"])

# Write the selected columns to a CSV file
selected_columns_df.write_csv('lab_logs_blindtest_activity_filtered.csv')


In [None]:
selected_columns_df.head(5)

## Indexing and inserting a new column

The following code indexes the events in the dataframe and inserts the index as the first column. This step is essential for tracking the order of events and ensuring that the data remains organized throughout the process.

In [None]:
# Build a 0-based running number with one entry per row
index_series = pl.Series("index", range(selected_columns_df.height))

# Insert the series as the first column (position 0), named "index"
selected_columns_df = selected_columns_df.insert_column(0, index_series)

# Write the DataFrame, now including the index column, to the filtered CSV
# file (this replaces the file of the same name written without the index)
selected_columns_df.write_csv('lab_logs_blindtest_activity_filtered.csv')


## TPOT model training and evaluation

The following code demonstrates how to train a TPOT model using the data prepared in the previous steps. The model is trained on the vectorized text data and evaluated to determine its performance. The best model is then exported for future use.

### Install necessary libraries

In [None]:
%conda install numpy scipy scikit-learn pandas joblib pytorch

In [None]:
%pip install deap update_checker tqdm stopit xgboost

In [None]:
%pip install tpot

### Initialize TPOT for Genetic Programming on the CPU

The following cell imports the libraries used in the rest of the notebook to build the features, vectorize the text with BERT, and train a TPOT classifier on the CPU. It also sets the `KMP_DUPLICATE_LIB_OK` environment variable to suppress a warning message. The classifier itself is trained, evaluated, and exported in later cells.

In [None]:
import os

# taking care of a warning message
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

import polars as pl
import re
from transformers import BertTokenizer, BertModel
import torch
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

### Building the feature vector

Here a feature vector is built by extracting the relevant features from the Sysmon traces. The feature vector is then used to train the classifier with TPOT.

In [None]:
# Case-insensitive patterns capturing the shortest path ending in ".exe" that
# follows the field label. The patterns are unanchored, so "Image: " is also
# found inside longer labels such as "ParentImage: ".
_IMAGE_PATTERN = re.compile(r"Image: (.*?\.exe)", re.IGNORECASE)
_TARGET_FILENAME_PATTERN = re.compile(r"TargetFilename: (.*?\.exe)", re.IGNORECASE)


def _first_capture(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or ""."""
    found = pattern.search(text)
    return "" if found is None else found.group(1)


def extract_info(text):
    """Pull the executable paths out of a Sysmon message.

    Returns a dict with three keys: ``image`` and ``target_filename`` hold
    the first ``.exe`` path found after the respective label (empty string
    when there is none), and ``text`` holds the unmodified input.
    """
    return {
        "image": _first_capture(_IMAGE_PATTERN, text),
        "target_filename": _first_capture(_TARGET_FILENAME_PATTERN, text),
        "text": text,
    }

In [None]:
# Run extract_info on every 'filtered_message' value with map_elements and
# keep the returned dict as an opaque Python object (pl.Object) in a new
# 'extracted_info' column
selected_columns_df = selected_columns_df.with_columns(
 pl.col("filtered_message").map_elements(lambda x: extract_info(x), return_dtype=pl.Object).alias("extracted_info")
)

In [None]:
# Unpack the dicts in 'extracted_info' into three string columns ('image',
# 'target_filename', 'text'), one map_elements call per key, then drop the
# intermediate object column
selected_columns_df = selected_columns_df.with_columns(
 pl.col("extracted_info").map_elements(lambda x: x['image'], return_dtype=pl.Utf8).alias("image"),
 pl.col("extracted_info").map_elements(lambda x: x['target_filename'], return_dtype=pl.Utf8).alias("target_filename"),
 pl.col("extracted_info").map_elements(lambda x: x['text'], return_dtype=pl.Utf8).alias("text")
).drop("extracted_info")

In [None]:
print(selected_columns_df)

#### Define the label based on conditions

The following code defines the label based on specific conditions. The conditions are applied to the image and target_filename columns to determine whether the event is malicious or benign. The label is then assigned accordingly. This step is crucial for training the TPOT classifier.

This is a single-label classification problem, where the label is binary (good or bad).

In [None]:
def define_label(row, known_bad_indices=(874,)):
    """Label one event as ``"bad"`` or ``"good"``.

    Args:
        row: Mapping with the keys ``index``, ``image`` and
            ``target_filename``.
        known_bad_indices: Row indices labelled ``"bad"`` regardless of their
            content. The default keeps the notebook's original hand-picked
            row 874.

    Returns:
        ``"bad"`` when Excel is the acting image and the target file is an
        executable, or when the row index is in *known_bad_indices*;
        otherwise ``"good"``.

    The comparisons are case-insensitive, in line with the case-insensitive
    patterns used to extract the paths, and missing (None) fields are treated
    as empty strings. The rules are combined with an explicit ``or`` instead
    of a dict keyed by booleans, where a later entry overwrites an earlier
    one that has the same truth value.
    """
    image = (row['image'] or "").upper()
    target_filename = (row['target_filename'] or "").lower()

    is_bad = (
        ("EXCEL.EXE" in image and ".exe" in target_filename)
        or row['index'] in known_bad_indices
        # Add more conditions here if needed
    )
    return "bad" if is_bad else "good"

In [None]:
# Apply the define_label function
selected_columns_df = selected_columns_df.with_columns(
 pl.struct(["index", "image", "target_filename"]).map_elements(define_label, return_dtype=pl.Utf8).alias("label")
)

In [None]:
print(selected_columns_df)

In [None]:
bad_rows = selected_columns_df.filter(pl.col("label") == "bad")
print(bad_rows)

### Vectorizing the text data using BERT

The following code demonstrates how to vectorize the text data using BERT. The vectorized text data is then used as input for the TPOT classifier. The BERT model is loaded and applied to the text column in the DataFrame to generate the feature vector.

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def vectorize_text(text):
    """Embed *text* with BERT and return the mean of its token vectors.

    The text is tokenized with truncation enabled, passed through the
    module-level ``model``, and ``last_hidden_state`` is averaged over dim 1
    (the token axis of Hugging Face BERT outputs).

    Returns:
        A NumPy array holding the averaged vector; for a single string it
        has one leading batch row, which callers flatten.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Inference only: skip autograd bookkeeping to save time and memory
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

# Apply vectorization to the Polars DataFrame using map_elements
selected_columns_df = selected_columns_df.with_columns(
 pl.col("text").map_elements(lambda x: vectorize_text(x).flatten(), return_dtype=pl.Object).alias("text_vector")
)

print(selected_columns_df)

In [10]:
from transformers import BertTokenizer

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Get the maximum number of tokens
max_tokens = tokenizer.model_max_length
print("Maximum number of tokens:", max_tokens)



Maximum number of tokens: 512


In [None]:
df = selected_columns_df.to_pandas()

# Save the Pandas DataFrame to a Parquet file
df.to_parquet("vectorized_texts.parquet")

In [1]:
import pandas as pd
# Load the DataFrame from the Parquet file
loaded_df = pd.read_parquet("vectorized_texts.parquet")

# Verify the loaded DataFrame
print(loaded_df)

 index log.level winlog.event_id \
0 0 information 10 
1 1 information 10 
2 2 information 1 
3 3 information 13 
4 4 information 1 
... ... ... ... 
1022 1022 information 1 
1023 1023 information 10 
1024 1024 information 1 
1025 1025 information 22 
1026 1026 information 1 

 winlog.task \
0 Process accessed (rule: ProcessAccess) 
1 Process accessed (rule: ProcessAccess) 
2 Process Create (rule: ProcessCreate) 
3 Registry value set (rule: RegistryEvent) 
4 Process Create (rule: ProcessCreate) 
... ... 
1022 Process Create (rule: ProcessCreate) 
1023 Process accessed (rule: ProcessAccess) 
1024 Process Create (rule: ProcessCreate) 
1025 Dns query (rule: DnsQuery) 
1026 Process Create (rule: ProcessCreate) 

 filtered_message \
0 Process accessed:\nRuleName: -\nSourceProcessI... 
1 Process accessed:\nRuleName: -\nSourceProcessI... 
2 Process Create:\nRuleName: -\nProcessId: 5196\... 
3 Registry value set:\nRuleName: Tamper-Winlogon... 
4 Process Create:\nRuleName: -\nProcessId: 6140\..

In [2]:
print(loaded_df.iloc[833])

index 833
log.level information
winlog.event_id 1
winlog.task Process Create (rule: ProcessCreate)
filtered_message Process Create:\nRuleName: -\nProcessId: 7680\...
image C:\Users\student\AppData\Local\Temp\file.exe
target_filename 
text Process Create:\nRuleName: -\nProcessId: 7680\...
label good
text_vector [-0.26701870560646057, 0.045040227472782135, 0...
Name: 833, dtype: object


In [9]:
# Pull out the record at position 833
row_833 = loaded_df.iloc[833]

# Dump every "column: value" pair of that record to a text file, one per line
with open('output_row_833.txt', 'w') as file:
    file.writelines(f"{column}: {value}\n" for column, value in row_833.items())

print("Row 833 has been printed to 'output_row_833.txt'")

Row 833 has been printed to 'output_row_833.txt'


In [8]:
# Retrieve the 'text_vector' and print its content and length
text_vector = row_833['text_vector']
print("text_vector:", text_vector)
print("Number of elements in 'text_vector':", len(text_vector))

text_vector: [-2.67018706e-01 4.50402275e-02 4.92597967e-01 -1.67466640e-01
 5.49281299e-01 -9.46303830e-02 1.31402090e-02 2.45818749e-01
 -3.08580045e-02 -5.99840544e-02 -4.08833206e-01 -3.07607472e-01
 -2.50869066e-01 3.46208543e-01 6.52808100e-02 4.03921425e-01
 -1.08293675e-01 1.94034234e-01 -1.76812530e-01 1.86863258e-01
 2.65023381e-01 -6.35761321e-02 -1.79275349e-01 4.30784822e-01
 4.68610853e-01 -4.75924127e-02 -5.95399998e-02 -1.90991446e-01
 -4.59939897e-01 3.03420693e-01 2.12900817e-01 -1.35921072e-02
 8.98968354e-02 -1.90729022e-01 5.30772842e-02 -1.82377368e-01
 -3.18339169e-02 -7.27270097e-02 1.21271454e-01 3.73404026e-01
 -2.88292766e-01 -7.65656829e-01 1.46813408e-01 -1.36955306e-01
 -8.59620795e-02 -5.23494661e-01 2.55036026e-01 2.86784172e-01
 1.17207281e-01 1.32331714e-01 -5.52943230e-01 1.82576746e-01
 -1.92080103e-02 -1.37846798e-01 2.42283568e-01 5.64341307e-01
 5.23214936e-01 -5.78589380e-01 -3.80595475e-01 -2.89306790e-01
 1.57396853e-01 2.33618170e-01 5.4761003

In [None]:
import os
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tpot import TPOTClassifier

# Load the DataFrame from the Parquet file
df = pd.read_parquet("vectorized_texts.parquet")

# Ensure to use only CPU for PyTorch
device = torch.device("cpu")

# Encode the string labels as integers
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])

# Stack the per-row vectors into one 2-D feature matrix. This replaces the
# torch.tensor(list_of_arrays).numpy() round-trip, which is slow and only
# served to obtain a NumPy array.
X = np.vstack(df['text_vector'].to_numpy())
y = df['label_encoded']

# Keep the class ratio identical in train and test so the rare class appears
# in both; stratification needs at least two samples per class.
stratify = y if y.value_counts().min() >= 2 else None

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify
)

# TPOT classifier with higher verbosity; seeded so the search is repeatable
tpot = TPOTClassifier(verbosity=3, generations=5, population_size=20, random_state=42)
tpot.fit(X_train, y_train)

# Evaluate the model
print("TPOT Score:", tpot.score(X_test, y_test))

# Save the best pipeline as Python source
tpot.export('tpot_pipeline.py')

# Print the exported pipeline
with open('tpot_pipeline.py') as f:
    print(f.read())

# Example of using the trained model
predictions = tpot.predict(X_test)
print("Predictions:", predictions)


In [None]:
print("The accuracy of the best model is: ", tpot.score(X_test, y_test))


In [None]:
%pip install matplotlib

In [None]:
import os
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tpot import TPOTClassifier
from collections import Counter

# Load the DataFrame from the Parquet file
df = pd.read_parquet("vectorized_texts.parquet")

# Ensure to use only CPU for PyTorch
device = torch.device("cpu")

# Encode the string labels as integers
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])

# Stack the per-row vectors into one 2-D feature matrix. This replaces the
# torch.tensor(list_of_arrays).numpy() round-trip, which is slow and only
# served to obtain a NumPy array.
X = np.vstack(df['text_vector'].to_numpy())
y = df['label_encoded']

# Keep the class ratio identical in train and test so the rare class appears
# in both; stratification needs at least two samples per class.
stratify = y if y.value_counts().min() >= 2 else None

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify
)

# TPOT classifier with higher verbosity; seeded so the search is repeatable
tpot = TPOTClassifier(verbosity=3, generations=5, population_size=20, random_state=42)
tpot.fit(X_train, y_train)

# Evaluate the model
print("TPOT Score:", tpot.score(X_test, y_test))

# Save the best pipeline as Python source
tpot.export('tpot_pipeline.py')

# Print the exported pipeline
with open('tpot_pipeline.py') as f:
    print(f.read())

# Example of using the trained model
predictions = tpot.predict(X_test)
print("Predictions:", predictions)

# Keep the record of every pipeline TPOT evaluated during the search
evaluated_pipelines = tpot.evaluated_individuals_


In [None]:
import re
from collections import Counter

import matplotlib.pyplot as plt

# Count occurrences of each operator name across all evaluated pipelines.
# A name is any identifier directly followed by "(". The previous lazy
# pattern r'\w+\(.*?\)' consumed nested calls inside its match, so operators
# wrapped by another operator were never counted.
operator_pattern = re.compile(r'(\w+)\(')

model_counter = Counter()
for pipeline_str in evaluated_pipelines:
    model_counter.update(operator_pattern.findall(pipeline_str))

print("Models and their occurrences:")
for model, count in model_counter.most_common():
    print(f"{model}: {count}")

# Visualize the counts, rarest first so the most frequent bar ends up on top
ranked = sorted(model_counter.items(), key=lambda item: item[1])
model_names = [name for name, _ in ranked]
model_counts = [count for _, count in ranked]

plt.figure(figsize=(12, 6))
plt.barh(model_names, model_counts, color='skyblue')
plt.xlabel('Number of Occurrences')
plt.ylabel('Model')
plt.title('Frequency of Models Tested by TPOT')
plt.show()