mirror of
https://github.com/norandom/log2ml.git
synced 2024-12-04 22:53:44 +00:00
renamed github data manager, cleaned up code
This commit is contained in:
parent
2dbd19292e
commit
def64cb79d
@ -1,13 +1,19 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ebb1428f6428646",
|
||||
"cell_type": "markdown",
|
||||
"id": "fa74a82c3dc6db1a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install pandas\n"
|
||||
"# GitHub for data releases "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "38c6a6b67bd16e42",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Introduction"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -78,7 +84,7 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%pip install PyGithub\n"
|
||||
"%pip install PyGithub"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -102,12 +108,12 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 14,
|
||||
"id": "1069e0bfa4686f67",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-05-11T16:23:55.511058Z",
|
||||
"start_time": "2024-05-11T16:23:23.511924Z"
|
||||
"end_time": "2024-05-16T17:44:25.085423Z",
|
||||
"start_time": "2024-05-16T17:44:21.570849Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
@ -115,18 +121,18 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"https://api.github.com/repos/norandom/log2ml/releases/assets/166259205\n"
|
||||
"https://api.github.com/repos/norandom/log2ml/releases/assets/168114916\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "7d46bf8efe9e4e0591950e029c353573",
|
||||
"model_id": "5f0622d443bc48728f0e7ed72ebd7fab",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
" 0%| | 0.00/1.90G [00:00<?, ?iB/s]"
|
||||
" 0%| | 0.00/6.28M [00:00<?, ?iB/s]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
@ -136,7 +142,7 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"File downloaded successfully and saved as lab_logs_normal_activity_may_11_2024.csv\n"
|
||||
"File downloaded successfully and saved as lab_logs_normal_activity_may_15_2024.csv\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -202,7 +208,7 @@
|
||||
"repository_name = \"norandom/log2ml\"\n",
|
||||
"\n",
|
||||
"# File name to search for\n",
|
||||
"file_name = \"lab_logs_normal_activity_may_6_2024.json\"\n",
|
||||
"file_name = \"lab_logs_normal_activity_may_15_2024.json\"\n",
|
||||
"\n",
|
||||
"# Get the download URL of the specific file\n",
|
||||
"# download_url = get_specific_file_from_latest_release(github_token, repository_name, file_name)\n",
|
||||
@ -210,7 +216,7 @@
|
||||
"print(download_url)\n",
|
||||
"\n",
|
||||
"if download_url:\n",
|
||||
" local_file_path = \"lab_logs_normal_activity_may_11_2024.csv\"\n",
|
||||
" local_file_path = \"lab_logs_normal_activity_may_15_2024.csv\"\n",
|
||||
" download_file(download_url, github_token, local_file_path)\n",
|
||||
"else:\n",
|
||||
" print(\"File not found.\")\n"
|
||||
@ -218,12 +224,12 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 16,
|
||||
"id": "393703bd6e7a693f",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-05-12T07:36:38.888903Z",
|
||||
"start_time": "2024-05-12T07:36:38.468176Z"
|
||||
"end_time": "2024-05-16T17:44:33.762701Z",
|
||||
"start_time": "2024-05-16T17:44:33.641528Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
@ -231,117 +237,13 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0 lab_logs_normal_activity_may_11_2024.csv\r\n"
|
||||
"8000 lab_logs_normal_activity_may_15_2024.csv\r\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!wc -l lab_logs_normal_activity_may_11_2024.csv"
|
||||
"!wc -l lab_logs_normal_activity_may_15_2024.csv"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6b35fdc991ccea39",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Flattening"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a293810e0531690c",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"start_time": "2024-05-11T15:26:15.019788Z"
|
||||
},
|
||||
"jupyter": {
|
||||
"is_executing": true
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import json\n",
|
||||
"\n",
|
||||
"# Process each line of the JSON file\n",
|
||||
"with open('lab_logs_normal_activity_may_11_2024.json', 'r') as file:\n",
|
||||
" for line in file:\n",
|
||||
" # Normalize the JSON object from the line\n",
|
||||
" data = json.loads(line)\n",
|
||||
" temp_df = pd.json_normalize(data)\n",
|
||||
"\n",
|
||||
" # Append the DataFrame to a growing CSV file\n",
|
||||
" temp_df.to_csv('lab_logs_normal_activity_may_11_2024_flat.csv', mode='a', header=False, index=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "771af611ba60a456",
|
||||
"metadata": {
|
||||
"jupyter": {
|
||||
"is_executing": true
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"flattened_df.head(10)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a4f782c59bb52c3f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Save the DataFrame to a CSV file\n",
|
||||
"df.to_csv('lab_logs_normal_activity_may_6_2024.csv', index=False)\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "803d4a7af2927bc8",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-05-07T11:43:06.171446Z",
|
||||
"start_time": "2024-05-07T11:42:52.776821Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Collecting pandas\r\n",
|
||||
" Downloading pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)\r\n",
|
||||
"Requirement already satisfied: numpy>=1.23.2 in /home/marius/miniconda3/envs/llm_langchain/lib/python3.11/site-packages (from pandas) (1.26.4)\r\n",
|
||||
"Requirement already satisfied: python-dateutil>=2.8.2 in /home/marius/miniconda3/envs/llm_langchain/lib/python3.11/site-packages (from pandas) (2.9.0)\r\n",
|
||||
"Requirement already satisfied: pytz>=2020.1 in /home/marius/miniconda3/envs/llm_langchain/lib/python3.11/site-packages (from pandas) (2024.1)\r\n",
|
||||
"Collecting tzdata>=2022.7 (from pandas)\r\n",
|
||||
" Using cached tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)\r\n",
|
||||
"Requirement already satisfied: six>=1.5 in /home/marius/miniconda3/envs/llm_langchain/lib/python3.11/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\r\n",
|
||||
"Downloading pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)\r\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.0/13.0 MB\u001b[0m \u001b[31m26.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\r\n",
|
||||
"\u001b[?25hUsing cached tzdata-2024.1-py2.py3-none-any.whl (345 kB)\r\n",
|
||||
"Installing collected packages: tzdata, pandas\r\n",
|
||||
"Successfully installed pandas-2.2.2 tzdata-2024.1\r\n",
|
||||
"Note: you may need to restart the kernel to use updated packages.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4dda90a02f3fb809",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
Loading…
Reference in New Issue
Block a user