# GitHub for data releases 

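# Introduction

This notebook downloads a data file that is attached as an asset to a GitHub release, authenticating with a personal access token read from an environment file.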

In [7]:
# python-dotenv provides `load_dotenv`, used below to read the GitHub token from an env file.
%pip install python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [53]:
# PyGithub provides the `Github` client used below to look up release assets.
# Pinned to the version this notebook was last run with, so a fresh run resolves the same package.
%pip install PyGithub==2.3.0

Collecting PyGithub
  Downloading PyGithub-2.3.0-py3-none-any.whl.metadata (3.8 kB)
Collecting pynacl>=1.4.0 (from PyGithub)
  Using cached PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (8.6 kB)
Collecting pyjwt>=2.4.0 (from pyjwt[crypto]>=2.4.0->PyGithub)
  Downloading PyJWT-2.8.0-py3-none-any.whl.metadata (4.2 kB)
Downloading PyGithub-2.3.0-py3-none-any.whl (354 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 354.4/354.4 kB 3.6 MB/s eta 0:00:00
Downloading PyJWT-2.8.0-py3-none-any.whl (22 kB)
Downloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (856 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 856.7/856.7 kB 11.4 MB/s eta 0:00:00
Installing collected packages: pyjwt, pynacl, PyGithub
Successfully installed PyGithub-2.3.0 pyjwt-2.8.0 pynacl-1.5.0
Note: you may need to restart the kernel to use updated packages.

In [2]:
from dotenv import load_dotenv
import os

# Read environment variables from the "thesis_env_ro" file so the token is not hard-coded here.
load_dotenv("thesis_env_ro", verbose=True)  # take environment variables from the file
# os.getenv returns None when the variable is not set; nothing in this cell checks for that.
token = os.getenv('GITHUB_PERSONAL_ACCESS_TOKEN')

In [14]:
from github import Github
import requests
from tqdm.notebook import tqdm


def get_specific_file_from_tagged_release(token, repo_name, tag_name, filename):
    """Find the API URL of an asset attached to the release with a given tag.

    Args:
        token: GitHub access token handed to the ``Github`` client.
        repo_name: Repository identifier passed to ``get_repo`` (e.g. "owner/repo").
        tag_name: Tag of the release to search.
        filename: Exact name of the asset to look for.

    Returns:
        The ``url`` attribute of the first matching asset. When no release with
        that tag carries an asset of that name, a hint is printed and ``None``
        is returned.
    """
    repository = Github(token).get_repo(repo_name)

    # Lazily walk the assets of every release that carries the requested tag.
    candidates = (
        attachment
        for rel in repository.get_releases()
        if rel.tag_name == tag_name
        for attachment in rel.get_assets()
        if attachment.name == filename
    )
    for attachment in candidates:
        # First hit wins; the remaining releases/assets are never fetched.
        return attachment.url

    print("File not found. Try get_specific_file_from_latest_release() instead.")
    return None

def get_specific_file_from_latest_release(token, repo_name, filename):
    """Find the API URL of an asset attached to the repository's latest release.

    Args:
        token: GitHub access token handed to the ``Github`` client.
        repo_name: Repository identifier passed to ``get_repo`` (e.g. "owner/repo").
        filename: Exact name of the asset to look for.

    Returns:
        The ``url`` attribute of the first matching asset. When the latest
        release has no asset of that name, a hint is printed and ``None`` is
        returned, mirroring ``get_specific_file_from_tagged_release``.
    """
    g = Github(token)
    repo = g.get_repo(repo_name)
    release = repo.get_latest_release()

    for asset in release.get_assets():
        if asset.name == filename:
            # asset.url is the API URL; fetching it needs the octet-stream Accept header.
            return asset.url

    # Report the miss instead of silently falling off the end of the function.
    print("File not found in the latest release. Try get_specific_file_from_tagged_release() instead.")
    return None

def download_file(url, token, save_path):
    """Stream a GitHub release asset to ``save_path`` with a progress bar.

    Args:
        url: API URL of the asset (the ``asset.url`` value returned by the
            lookup helpers).
        token: GitHub access token. When falsy (e.g. ``None`` because the
            environment variable was unset) no ``Authorization`` header is
            sent, so the request is made anonymously.
        save_path: Local path the downloaded bytes are written to.

    Raises:
        requests.HTTPError: If the server answers with an error status.

    Prints a success message, or an error message when the number of bytes
    written does not match the advertised ``Content-Length``.
    """
    # 'application/octet-stream' asks the API for the binary content rather than JSON metadata.
    headers = {'Accept': 'application/octet-stream'}
    if token:
        headers['Authorization'] = f'token {token}'

    block_size = 1024

    # A single request is enough: requests follows the redirect to the storage
    # host on its own and drops the Authorization header when the host changes,
    # so the asset is fetched once and the token is not forwarded to that host.
    with requests.get(url, headers=headers, stream=True) as response:
        response.raise_for_status()
        total_size_in_bytes = int(response.headers.get('content-length', 0))
        bytes_written = 0

        with open(save_path, 'wb') as file:
            # The context manager closes the bar even if the transfer fails midway.
            with tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True) as progress_bar:
                for data in response.iter_content(block_size):
                    file.write(data)
                    bytes_written += len(data)
                    progress_bar.update(len(data))

    # A zero total means the server sent no Content-Length, so there is nothing to compare against.
    if total_size_in_bytes != 0 and bytes_written != total_size_in_bytes:
        print("ERROR, something went wrong")
    else:
        print(f"File downloaded successfully and saved as {save_path}")

# GitHub token loaded from the environment in an earlier cell (None if the variable was unset)
github_token = token

# Repository name in "owner/repo" form
repository_name = "norandom/log2ml"

# Name of the release asset to search for
file_name = "lab_logs_normal_activity_may_15_2024.json"

# Look up the asset's API URL in the release tagged "foundations".
# Alternative: search the latest release instead of a fixed tag:
# download_url = get_specific_file_from_latest_release(github_token, repository_name, file_name)
download_url = get_specific_file_from_tagged_release(github_token, repository_name, "foundations", file_name)
print(download_url)

if download_url:
    # NOTE(review): the asset is named *.json but is saved with a .csv extension — confirm the
    # intended format before renaming; a later cell refers to the file by this .csv name.
    local_file_path = "lab_logs_normal_activity_may_15_2024.csv"
    download_file(download_url, github_token, local_file_path)
else:
    # The tagged-release lookup prints its own not-found hint before returning None,
    # so this message appears in addition to that one.
    print("File not found.")


https://api.github.com/repos/norandom/log2ml/releases/assets/168114916


  0%|          | 0.00/6.28M [00:00<?, ?iB/s]

File downloaded successfully and saved as lab_logs_normal_activity_may_15_2024.csv


In [16]:
# Sanity check: count the lines of the file written by the download cell.
!wc -l lab_logs_normal_activity_may_15_2024.csv

8000 lab_logs_normal_activity_may_15_2024.csv
