From 75675c758481e422565abccce883bb59a5aa8948 Mon Sep 17 00:00:00 2001
From: DVKunion
Date: Sun, 30 Jul 2023 22:38:12 +0800
Subject: [PATCH] init

---
 .github/workflows/collect.yml |  26 ++++++++
 .gitignore                    |   2 +-
 README.md                     |   5 +-
 main.py                       | 154 ++++++++++++++++++++++++++++++++++
 requirement.txt               |   4 ++
 5 files changed, 189 insertions(+), 2 deletions(-)
 create mode 100644 .github/workflows/collect.yml
 create mode 100644 main.py
 create mode 100644 requirement.txt

diff --git a/.github/workflows/collect.yml b/.github/workflows/collect.yml
new file mode 100644
index 0000000..50f9936
--- /dev/null
+++ b/.github/workflows/collect.yml
@@ -0,0 +1,26 @@
+name: Collect
+
+on:
+  push:
+    branches: [ "main" ]
+#  schedule:
+#    - cron: '0 0 * * *'
+
+jobs:
+  collect-github:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        name: Set up Python
+        with:
+          python-version: '3.10'
+      - name: Prepare
+        run: pip3 install -r requirement.txt
+      - name: Crawler
+        run: python3 main.py ${{ secrets.CUSTOMS_GITHUB_TOKEN }}
+      - uses: actions/upload-artifact@v3
+        with:
+          name: result.json
+          path: result.json
+
diff --git a/.gitignore b/.gitignore
index 68bc17f..2dc53ca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -157,4 +157,4 @@ cython_debug/
 # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+.idea/
diff --git a/README.md b/README.md
index c4a83a4..2650e6a 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,4 @@
-# customs
\ No newline at end of file
+# Customs
+
+镜像海关 - Github Action自动化 Docker 镜像数据收集与分析,为云安全研究提供数据支持。
+
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..58bbd55
--- /dev/null
+++ b/main.py
@@ -0,0 +1,154 @@
+import base64
+import functools
+import json
+import re
+import sys
+import time
+
+import requests
+from dockerfile_parse import DockerfileParser
+
+# GitHub Personal Access Token; set from the first command-line argument.
+ACCESS_TOKEN = ''
+
+SEARCH_URL = 'https://api.github.com/search/code'
+PER_PAGE = 100
+# The GitHub search API serves at most 1000 results per query.
+MAX_SEARCH_RESULTS = 1000
+# Seconds to wait for any single HTTP response.
+REQUEST_TIMEOUT = 30
+
+result = []
+# Repositories whose Dockerfiles are skipped.
+while_repo_list = ["docker_repair"]
+
+
+def calculate_execution_time(func):
+    """Wrap ``func`` so that every call prints its wall-clock duration."""
+
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        start_time = time.time()
+        value = func(*args, **kwargs)
+        execution_time = time.time() - start_time
+        print(f"Function '{func.__name__}' executed in {execution_time:.6f} seconds.")
+        return value
+
+    return wrapper
+
+
+def build_headers():
+    """Return the GitHub API headers for the current ACCESS_TOKEN."""
+    return {'Authorization': f'Bearer {ACCESS_TOKEN}', "Accept": "application/vnd.github+json"}
+
+
+def fetch_json(url, headers, params=None):
+    """GET ``url`` and return its JSON body, or None when the status is not 200."""
+    response = requests.get(url, headers=headers, params=params, timeout=REQUEST_TIMEOUT)
+    if response.status_code != 200:
+        print(response.text)
+        return None
+    return response.json()
+
+
+@calculate_execution_time
+def get_dockerfile_images():
+    """Search GitHub for Dockerfiles and write their base images to result.json."""
+    headers = build_headers()
+    params = {'q': 'FROM language:Dockerfile', "page": "1", "per_page": str(PER_PAGE)}
+
+    response_json = fetch_json(SEARCH_URL, headers, params)
+    if response_json is None:
+        sys.exit("search request failed")
+    parse_result(response_json, headers)
+    total = response_json["total_count"]
+    print(total)
+    # Ceiling division: request only pages that exist and that the API serves.
+    last_page = -(-min(total, MAX_SEARCH_RESULTS) // PER_PAGE)
+    for page in range(2, last_page + 1):
+        params["page"] = str(page)
+        response_json = fetch_json(SEARCH_URL, headers, params)
+        if response_json is None:
+            # Keep what was collected so far instead of crashing mid-run.
+            break
+        parse_result(response_json, headers)
+
+    with open("result.json", "w", encoding="utf-8") as file:
+        json.dump(result, file)
+
+
+def parse_result(response_json, headers=None):
+    """Append one entry to ``result`` per Dockerfile in a search response page.
+
+    ``headers`` are the request headers for fetching each file; when omitted
+    they are built from ACCESS_TOKEN.
+    """
+    if headers is None:
+        headers = build_headers()
+    for item in response_json["items"]:
+        res = {
+            'filename': item["name"],
+            'path': item["path"],
+            'repo_name': item["repository"]["name"],
+        }
+        if res['repo_name'] in while_repo_list:
+            continue
+        url = item["url"]
+        res['detail_url'] = url
+        print(f"{res['filename']}:{res['path']} - {res['detail_url']}")
+        detail = fetch_json(url, headers)
+        if detail is None or "content" not in detail:
+            continue
+        # The contents API returns the file body base64-encoded.
+        decoded_bytes = base64.b64decode(detail["content"])
+        decoded_string = decoded_bytes.decode('utf-8', errors='replace')
+        dfp = DockerfileParser()
+        dfp.content = decoded_string
+        res['images'] = extract_images(dfp.structure, decoded_string)
+        result.append(res)
+
+
+def extract_images(structure, dockerfile):
+    """Return the image named by each FROM instruction, in file order.
+
+    ``${NAME}`` placeholders are filled from ``ARG NAME=value`` lines, and a
+    FROM that names an earlier build stage yields that stage's image.
+    """
+    images = []
+    alias = {}
+    for line in structure:
+        if line["instruction"] != "FROM":
+            continue
+        # Flags such as --platform=... come before the image name.
+        values = [v for v in line["value"].split() if not v.startswith("--")]
+        if not values:
+            continue
+        image = values[0]
+        for arg_name in re.findall(r'\$\{(\w+)\}', image):
+            arg_value = get_arg_value(arg_name, dockerfile)
+            if arg_value:
+                image = image.replace(f'${{{arg_name}}}', arg_value)
+        image = alias.get(image, image)
+        if len(values) >= 3 and values[1].lower() == "as":
+            alias[values[2]] = image
+        images.append(image)
+    return images
+
+
+def get_arg_value(arg_name, dockerfile):
+    """Return the default of ``ARG arg_name=...`` in ``dockerfile``, or None."""
+    arg_pattern = rf'^[ \t]*ARG[ \t]+{re.escape(arg_name)}=([^\n\r]+)'
+    arg_match = re.search(arg_pattern, dockerfile, re.MULTILINE)
+    if arg_match is None:
+        return None
+    return arg_match.group(1).strip().replace("\"", "").replace("'", "")
+
+
+if __name__ == '__main__':
+    args = sys.argv[1:]
+    # A missing argument is treated like an empty token.
+    if not args or args[0] == "":
+        print("empty token")
+        sys.exit(1)
+    ACCESS_TOKEN = args[0]
+    get_dockerfile_images()
diff --git a/requirement.txt b/requirement.txt
new file mode 100644
index 0000000..a8fe566
--- /dev/null
+++ b/requirement.txt
@@ -0,0 +1,4 @@
+requests
+pysocks==1.7.1
+urllib3==2.0.4
+dockerfile-parse==2.0.1
\ No newline at end of file