mirror of
https://github.com/norandom/log2ml.git
synced 2024-12-04 22:53:44 +00:00
Prototype Beta, Graph Convolutional Network Embeddings with BERT with Sysmon data
This commit is contained in:
parent
ac1311744c
commit
da68bc33ba
301
Prototype_Beta_GCN_BERT_Embedding_Sysmon.ipynb
Normal file
301
Prototype_Beta_GCN_BERT_Embedding_Sysmon.ipynb
Normal file
@ -0,0 +1,301 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "53fefa0e-f261-47a5-9481-9c00e0d68e25",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Looking in links: https://download.pytorch.org/whl/torch_stable.html\n",
|
||||
"Requirement already satisfied: torch==2.2.1+cpu in /home/marius/anaconda3/lib/python3.11/site-packages (2.2.1+cpu)\n",
|
||||
"Requirement already satisfied: filelock in /home/marius/anaconda3/lib/python3.11/site-packages (from torch==2.2.1+cpu) (3.9.0)\n",
|
||||
"Requirement already satisfied: typing-extensions>=4.8.0 in /home/marius/anaconda3/lib/python3.11/site-packages (from torch==2.2.1+cpu) (4.11.0)\n",
|
||||
"Requirement already satisfied: sympy in /home/marius/anaconda3/lib/python3.11/site-packages (from torch==2.2.1+cpu) (1.11.1)\n",
|
||||
"Requirement already satisfied: networkx in /home/marius/anaconda3/lib/python3.11/site-packages (from torch==2.2.1+cpu) (3.1)\n",
|
||||
"Requirement already satisfied: jinja2 in /home/marius/anaconda3/lib/python3.11/site-packages (from torch==2.2.1+cpu) (3.1.2)\n",
|
||||
"Requirement already satisfied: fsspec in /home/marius/anaconda3/lib/python3.11/site-packages (from torch==2.2.1+cpu) (2023.4.0)\n",
|
||||
"Requirement already satisfied: MarkupSafe>=2.0 in /home/marius/anaconda3/lib/python3.11/site-packages (from jinja2->torch==2.2.1+cpu) (2.1.1)\n",
|
||||
"Requirement already satisfied: mpmath>=0.19 in /home/marius/anaconda3/lib/python3.11/site-packages (from sympy->torch==2.2.1+cpu) (1.3.0)\n",
|
||||
"Looking in links: https://data.pyg.org/whl/torch-2.2.1+cpu.html\n",
|
||||
"Requirement already satisfied: torch-geometric in /home/marius/anaconda3/lib/python3.11/site-packages (2.5.3)\n",
|
||||
"Requirement already satisfied: tqdm in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-geometric) (4.65.0)\n",
|
||||
"Requirement already satisfied: numpy in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-geometric) (1.24.3)\n",
|
||||
"Requirement already satisfied: scipy in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-geometric) (1.11.4)\n",
|
||||
"Requirement already satisfied: fsspec in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-geometric) (2023.4.0)\n",
|
||||
"Requirement already satisfied: jinja2 in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-geometric) (3.1.2)\n",
|
||||
"Requirement already satisfied: aiohttp in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-geometric) (3.8.5)\n",
|
||||
"Requirement already satisfied: requests in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-geometric) (2.31.0)\n",
|
||||
"Requirement already satisfied: pyparsing in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-geometric) (3.0.9)\n",
|
||||
"Requirement already satisfied: scikit-learn in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-geometric) (1.3.0)\n",
|
||||
"Requirement already satisfied: psutil>=5.8.0 in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-geometric) (5.9.0)\n",
|
||||
"Requirement already satisfied: attrs>=17.3.0 in /home/marius/anaconda3/lib/python3.11/site-packages (from aiohttp->torch-geometric) (22.1.0)\n",
|
||||
"Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /home/marius/anaconda3/lib/python3.11/site-packages (from aiohttp->torch-geometric) (2.0.4)\n",
|
||||
"Requirement already satisfied: multidict<7.0,>=4.5 in /home/marius/anaconda3/lib/python3.11/site-packages (from aiohttp->torch-geometric) (6.0.2)\n",
|
||||
"Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /home/marius/anaconda3/lib/python3.11/site-packages (from aiohttp->torch-geometric) (4.0.2)\n",
|
||||
"Requirement already satisfied: yarl<2.0,>=1.0 in /home/marius/anaconda3/lib/python3.11/site-packages (from aiohttp->torch-geometric) (1.8.1)\n",
|
||||
"Requirement already satisfied: frozenlist>=1.1.1 in /home/marius/anaconda3/lib/python3.11/site-packages (from aiohttp->torch-geometric) (1.3.3)\n",
|
||||
"Requirement already satisfied: aiosignal>=1.1.2 in /home/marius/anaconda3/lib/python3.11/site-packages (from aiohttp->torch-geometric) (1.2.0)\n",
|
||||
"Requirement already satisfied: MarkupSafe>=2.0 in /home/marius/anaconda3/lib/python3.11/site-packages (from jinja2->torch-geometric) (2.1.1)\n",
|
||||
"Requirement already satisfied: idna<4,>=2.5 in /home/marius/anaconda3/lib/python3.11/site-packages (from requests->torch-geometric) (3.4)\n",
|
||||
"Requirement already satisfied: urllib3<3,>=1.21.1 in /home/marius/anaconda3/lib/python3.11/site-packages (from requests->torch-geometric) (1.26.16)\n",
|
||||
"Requirement already satisfied: certifi>=2017.4.17 in /home/marius/anaconda3/lib/python3.11/site-packages (from requests->torch-geometric) (2023.11.17)\n",
|
||||
"Requirement already satisfied: joblib>=1.1.1 in /home/marius/anaconda3/lib/python3.11/site-packages (from scikit-learn->torch-geometric) (1.2.0)\n",
|
||||
"Requirement already satisfied: threadpoolctl>=2.0.0 in /home/marius/anaconda3/lib/python3.11/site-packages (from scikit-learn->torch-geometric) (2.2.0)\n",
|
||||
"Looking in links: https://data.pyg.org/whl/torch-2.2.1+cpu.html\n",
|
||||
"Collecting torch-scatter\n",
|
||||
" Downloading https://data.pyg.org/whl/torch-2.2.0%2Bcpu/torch_scatter-2.1.2%2Bpt22cpu-cp311-cp311-linux_x86_64.whl (511 kB)\n",
|
||||
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m511.8/511.8 kB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\n",
|
||||
"\u001b[?25hInstalling collected packages: torch-scatter\n",
|
||||
"Successfully installed torch-scatter-2.1.2+pt22cpu\n",
|
||||
"Looking in links: https://data.pyg.org/whl/torch-2.2.1+cpu.html\n",
|
||||
"Collecting torch-sparse\n",
|
||||
" Downloading https://data.pyg.org/whl/torch-2.2.0%2Bcpu/torch_sparse-0.6.18%2Bpt22cpu-cp311-cp311-linux_x86_64.whl (1.2 MB)\n",
|
||||
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m1.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m0m:02\u001b[0m\n",
|
||||
"\u001b[?25hRequirement already satisfied: scipy in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-sparse) (1.11.4)\n",
|
||||
"Requirement already satisfied: numpy<1.28.0,>=1.21.6 in /home/marius/anaconda3/lib/python3.11/site-packages (from scipy->torch-sparse) (1.24.3)\n",
|
||||
"Installing collected packages: torch-sparse\n",
|
||||
"Successfully installed torch-sparse-0.6.18+pt22cpu\n",
|
||||
"Looking in links: https://data.pyg.org/whl/torch-2.2.1+cpu.html\n",
|
||||
"Collecting torch-cluster\n",
|
||||
" Downloading https://data.pyg.org/whl/torch-2.2.0%2Bcpu/torch_cluster-1.6.3%2Bpt22cpu-cp311-cp311-linux_x86_64.whl (776 kB)\n",
|
||||
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m776.6/776.6 kB\u001b[0m \u001b[31m982.1 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m1m954.7 kB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\n",
|
||||
"\u001b[?25hRequirement already satisfied: scipy in /home/marius/anaconda3/lib/python3.11/site-packages (from torch-cluster) (1.11.4)\n",
|
||||
"Requirement already satisfied: numpy<1.28.0,>=1.21.6 in /home/marius/anaconda3/lib/python3.11/site-packages (from scipy->torch-cluster) (1.24.3)\n",
|
||||
"Installing collected packages: torch-cluster\n",
|
||||
"Successfully installed torch-cluster-1.6.3+pt22cpu\n",
|
||||
"Looking in links: https://data.pyg.org/whl/torch-2.2.1+cpu.html\n",
|
||||
"Collecting torch-spline-conv\n",
|
||||
" Downloading https://data.pyg.org/whl/torch-2.2.0%2Bcpu/torch_spline_conv-1.2.2%2Bpt22cpu-cp311-cp311-linux_x86_64.whl (215 kB)\n",
|
||||
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m215.8/215.8 kB\u001b[0m \u001b[31m427.9 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m kB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m:01\u001b[0m\n",
|
||||
"\u001b[?25hInstalling collected packages: torch-spline-conv\n",
|
||||
"Successfully installed torch-spline-conv-1.2.2+pt22cpu\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
"# Versions are pinned to the ones recorded in this cell's saved output.\n",
"# %pip (rather than !pip) installs into the environment of the running kernel.\n",
"%pip install torch==2.2.1+cpu -f https://download.pytorch.org/whl/torch_stable.html\n",
"%pip install torch-geometric==2.5.3 -f https://data.pyg.org/whl/torch-2.2.1+cpu.html\n",
"%pip install torch-scatter==2.1.2 -f https://data.pyg.org/whl/torch-2.2.1+cpu.html\n",
"%pip install torch-sparse==0.6.18 -f https://data.pyg.org/whl/torch-2.2.1+cpu.html\n",
"%pip install torch-cluster==1.6.3 -f https://data.pyg.org/whl/torch-2.2.1+cpu.html\n",
"%pip install torch-spline-conv==1.2.2 -f https://data.pyg.org/whl/torch-2.2.1+cpu.html\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "f6c8b735-a54b-4485-bd16-34fad500f60f",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/home/marius/anaconda3/lib/python3.11/site-packages/transformers/utils/generic.py:260: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
|
||||
" torch.utils._pytree._register_pytree_node(\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"import torch\n",
|
||||
"import torch.nn.functional as F\n",
|
||||
"from transformers import BertTokenizer, BertModel\n",
|
||||
"import networkx as nx\n",
|
||||
"from torch_geometric.data import Data\n",
|
||||
"from torch_geometric.nn import GCNConv\n",
|
||||
"from torch_geometric.utils import from_networkx"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "ee9877b3-ece3-4f37-bc1b-73e0994325c7",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"GCN embeddings shape: torch.Size([7, 768])\n",
|
||||
"tensor([[-0.2238, -0.1572, 0.0377, ..., -0.1945, -0.2714, 0.7107],\n",
|
||||
" [-0.0212, -0.0117, 0.0010, ..., -0.0172, -0.0238, 0.0644],\n",
|
||||
" [-0.0212, -0.0117, 0.0010, ..., -0.0172, -0.0238, 0.0644],\n",
|
||||
" ...,\n",
|
||||
" [-0.0212, -0.0117, 0.0010, ..., -0.0172, -0.0238, 0.0644],\n",
|
||||
" [-0.0212, -0.0117, 0.0010, ..., -0.0172, -0.0238, 0.0644],\n",
|
||||
" [-0.0212, -0.0117, 0.0010, ..., -0.0172, -0.0238, 0.0644]])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import torch\n",
|
||||
"import torch.nn.functional as F\n",
|
||||
"import networkx as nx\n",
|
||||
"from transformers import BertTokenizer, BertModel\n",
|
||||
"from torch_geometric.data import Data\n",
|
||||
"from torch_geometric.nn import GCNConv\n",
|
||||
"from torch_geometric.utils import from_networkx\n",
|
||||
"\n",
"# Sample event record, defined inline as a column -> values dict (no CSV file is read).\n",
"# It is named `sample_events` rather than `data` because `data` is bound to the\n",
"# PyTorch Geometric graph object later in this cell.\n",
"sample_events = {\n",
"    \"Timestamp\": [\"2024-05-15T16:00:15.887Z\"],\n",
"    \"OS\": [\"win10\"],\n",
"    \"IP\": [\"fe80::24b4:3691:44a6:38a1\"],\n",
"    \"LogLevel\": [\"information\"],\n",
"    \"EventID\": [5379],\n",
"    \"Category\": [\"User Account Management\"],\n",
"    \"Description\": [\n",
"        \"\"\"Credential Manager credentials were read.\n",
"\n",
" Subject:\n",
" Security ID: S-1-5-18\n",
" Account Name: WIN10$\n",
" Account Domain: sec699-20\n",
" Logon ID: 0x3E7\n",
" Read Operation: Enumerate Credentials\n",
"\n",
" This event occurs when a user performs a read operation on stored credentials in Credential Manager.\"\"\"\n",
"    ]\n",
"}\n",
"\n",
"# Convert to DataFrame (one row per event)\n",
"df = pd.DataFrame(sample_events)\n",
|
||||
"\n",
|
||||
"# Function to parse hierarchical data from description\n",
"def parse_description(description):\n",
"    \"\"\"Parse the 'Key: Value' lines of an event description into a flat dict.\n",
"\n",
"    Every non-blank line is stripped; a line containing a colon is split at\n",
"    the first colon into key and value. A colon-free line continues the\n",
"    field directly above it. A blank line closes the current field, so a\n",
"    free-text paragraph that follows a field block is not glued onto the\n",
"    last value. Colon-free text with no open field (for example the opening\n",
"    summary sentence) is ignored.\n",
"\n",
"    Args:\n",
"        description: Multi-line description text. Anything that is not a\n",
"            string (None, NaN from a missing cell) yields an empty dict.\n",
"\n",
"    Returns:\n",
"        dict mapping field name to value. A header-only line such as\n",
"        'Subject:' maps to ''. If a key repeats, the last occurrence wins.\n",
"    \"\"\"\n",
"    hierarchy = {}\n",
"    if not isinstance(description, str):\n",
"        return hierarchy\n",
"\n",
"    key = None\n",
"    for raw_line in description.splitlines():\n",
"        line = raw_line.strip()\n",
"        if not line:\n",
"            # Paragraph break: whatever follows is not a continuation.\n",
"            key = None\n",
"            continue\n",
"        if ':' in line:\n",
"            name, _, value = line.partition(':')\n",
"            key = name.strip()\n",
"            hierarchy[key] = value.strip()\n",
"        elif key:\n",
"            # strip() avoids a leading space when the open field was empty.\n",
"            hierarchy[key] = (hierarchy[key] + ' ' + line).strip()\n",
"    return hierarchy\n",
|
||||
"\n",
"# Apply parsing to the Description column\n",
"df['ParsedDescription'] = df['Description'].apply(parse_description)\n",
"\n",
"# Initialize graph\n",
"G = nx.Graph()\n",
"\n",
"# Load pre-trained BERT model and tokenizer. The encoder gets its own name\n",
"# because `model` is bound to the GCN further down in this cell.\n",
"tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n",
"bert_model = BertModel.from_pretrained('bert-base-uncased')\n",
"bert_model.eval()  # inference only: make sure dropout is disabled\n",
"embedding_dim = bert_model.config.hidden_size  # width of the BERT hidden states\n",
"\n",
"# Add nodes and edges to the graph\n",
"for index, row in df.iterrows():\n",
"    parsed = row['ParsedDescription']\n",
"    description = row['Description']\n",
"\n",
"    # Encode the description. Padding is not needed for a single sequence, and\n",
"    # no_grad() keeps autograd from tracking the forward pass.\n",
"    inputs = tokenizer(description, return_tensors='pt', max_length=512, truncation=True)\n",
"    with torch.no_grad():\n",
"        outputs = bert_model(**inputs)\n",
"    # The hidden state of the first token is used as the event embedding;\n",
"    # copy() detaches the vector from the full hidden-state buffer.\n",
"    description_embedding = outputs.last_hidden_state[0, 0, :].numpy().copy()\n",
"\n",
"    # Add event node with BERT embedding\n",
"    event_node = 'Event_{}'.format(index)\n",
"    G.add_node(event_node, type='event', embedding=description_embedding)\n",
"\n",
"    # Add one node per field value and connect it to the event node.\n",
"    # Nodes are keyed by the raw value, so equal values share a node and the\n",
"    # first key seen sets its `type`.\n",
"    for key, value in parsed.items():\n",
"        if not value:\n",
"            # Header-only fields such as 'Subject:' have no value; skip them\n",
"            # instead of creating a node whose id is the empty string.\n",
"            continue\n",
"        if not G.has_node(value):\n",
"            # Non-event nodes start from an all-zero feature vector.\n",
"            G.add_node(value, type=key, embedding=torch.zeros(embedding_dim).numpy())\n",
"        G.add_edge(event_node, value)\n",
|
||||
"\n",
"# Convert NetworkX graph to PyTorch Geometric data\n",
"data = from_networkx(G)\n",
"\n",
"# Build the node feature matrix, one row per node, iterating G.nodes.\n",
"# Stacking per-node tensors avoids the slow path (and the UserWarning) of\n",
"# calling torch.tensor() on a Python list of NumPy arrays.\n",
"node_embeddings = [G.nodes[node]['embedding'] for node in G.nodes]\n",
"data.x = torch.stack([torch.as_tensor(vec, dtype=torch.float) for vec in node_embeddings])\n",
"assert data.x.shape[0] == G.number_of_nodes(), 'expected one feature row per graph node'\n",
|
||||
"\n",
|
||||
"# Define a simple GCN model\n",
"class GCN(torch.nn.Module):\n",
"    \"\"\"Three stacked GCNConv layers with ReLU after the first two.\n",
"\n",
"    Args:\n",
"        in_channels: Node feature size. None (the default) falls back to\n",
"            `data.num_node_features` of the notebook-level graph, so the\n",
"            existing `GCN()` call keeps working.\n",
"        hidden_channels: Output size of the first layer.\n",
"        bottleneck_channels: Output size of the second layer.\n",
"        out_channels: Output size of the last layer. None (the default)\n",
"            reuses `in_channels`, so the output has the same width as the\n",
"            input features.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, in_channels=None, hidden_channels=256, bottleneck_channels=128, out_channels=None):\n",
"        super().__init__()\n",
"        if in_channels is None:\n",
"            in_channels = data.num_node_features\n",
"        if out_channels is None:\n",
"            # Was hard-coded to 768, which only matched 768-wide inputs.\n",
"            out_channels = in_channels\n",
"        self.conv1 = GCNConv(in_channels, hidden_channels)\n",
"        self.conv2 = GCNConv(hidden_channels, bottleneck_channels)\n",
"        self.conv3 = GCNConv(bottleneck_channels, out_channels)\n",
"\n",
"    def forward(self, data):\n",
"        \"\"\"Return the third layer's output computed from `data.x` and `data.edge_index`.\"\"\"\n",
"        x, edge_index = data.x, data.edge_index\n",
"        x = F.relu(self.conv1(x, edge_index))\n",
"        x = F.relu(self.conv2(x, edge_index))\n",
"        return self.conv3(x, edge_index)\n",
|
||||
"\n",
"# Create and train the model\n",
"torch.manual_seed(42)  # fixed seed so the random weight initialisation is repeatable\n",
"model = GCN()\n",
"optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)\n",
"\n",
"num_epochs = 200\n",
"model.train()\n",
"for epoch in range(num_epochs):\n",
"    optimizer.zero_grad()\n",
"    out = model(data)\n",
"    # Placeholder objective: reconstruct the input node features.\n",
"    loss = F.mse_loss(out, data.x)\n",
"    loss.backward()\n",
"    optimizer.step()\n",
"\n",
"# Extract the embeddings from the trained model\n",
"model.eval()\n",
"with torch.no_grad():\n",
"    embeddings = model(data)\n",
"\n",
"# Print the embeddings\n",
"print(\"GCN embeddings shape:\", embeddings.shape)\n",
"print(embeddings)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "bdb7c728-a8c2-4e89-bea8-d0f31585a7cc",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
Loading…
Reference in New Issue
Block a user