Skip to content

Commit

Permalink
init
Browse files Browse the repository at this point in the history
  • Loading branch information
DVKunion committed Jul 31, 2023
1 parent ad68146 commit 75675c7
Show file tree
Hide file tree
Showing 5 changed files with 150 additions and 2 deletions.
26 changes: 26 additions & 0 deletions .github/workflows/collect.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Runs the Dockerfile crawler (main.py) on every push to main and publishes
# the generated result.json as a build artifact.
name: Collect

on:
  push:
    branches: [ "main" ]
#  schedule:
#    - cron: '0 0 * * *'

jobs:
  collect-github:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        name: Set up Python
        with:
          python-version: '3.10'
      - name: Prepare
        run: pip3 install -r requirement.txt
      - name: Crawler
        # Pass the secret through the environment and quote it, so that an
        # empty secret still reaches main.py as an (empty) argument and the
        # shell never interprets characters inside the token.
        env:
          CUSTOMS_GITHUB_TOKEN: ${{ secrets.CUSTOMS_GITHUB_TOKEN }}
        run: python3 main.py "$CUSTOMS_GITHUB_TOKEN"
      - uses: actions/upload-artifact@v3
        with:
          name: result.json
          path: result.json

2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -157,4 +157,4 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.idea/
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
# customs
# Customs

镜像海关 - 基于 GitHub Actions 自动化收集与分析 Docker 镜像数据,为云安全研究提供数据支持。

115 changes: 115 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import base64
import json
import re
import sys
import time

import requests
from dockerfile_parse import DockerfileParser

# Replace with your GitHub Personal Access Token.
# When the file is run as a script this value is overwritten with the first
# command-line argument.
ACCESS_TOKEN = ''

# Accumulates one dict per parsed Dockerfile; written to result.json once the
# crawl is finished.
result = []
# Repository names whose search hits parse_result skips.
while_repo_list = ["docker_repair"]


def calculate_execution_time(func):
    """Decorate *func* so that every call prints how long it took.

    The wrapper forwards all positional and keyword arguments, returns
    whatever *func* returns, and keeps the wrapped function's metadata
    (``__name__``, ``__doc__``, ...) intact.

    :param func: the callable to time.
    :return: a wrapper with the same call signature as *func*.
    """
    from functools import wraps

    @wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by system clock adjustments.
        start_time = time.perf_counter()
        outcome = func(*args, **kwargs)
        execution_time = time.perf_counter() - start_time
        print(f"Function '{func.__name__}' executed in {execution_time:.6f} seconds.")
        return outcome

    return wrapper


@calculate_execution_time
def get_dockerfile_images():
    """Search GitHub for Dockerfiles and dump the images they build from.

    Pages through the code-search endpoint, hands every successful page to
    ``parse_result`` (which appends to the module-level ``result`` list) and
    finally writes ``result`` to ``result.json``.

    Paging stops at the first non-200 response, at an empty page, or once
    every reachable hit has been requested; whatever was collected up to
    that point is still written out.
    """
    url = 'https://api.github.com/search/code'
    per_page = 100
    # The search API only serves the first 1000 hits of a query, so asking
    # for pages beyond that limit can only produce error responses.
    max_results = 1000
    params = {'q': 'FROM language:Dockerfile', "page": "1", "per_page": str(per_page)}
    headers = {'Authorization': f'Bearer {ACCESS_TOKEN}', "Accept": "application/vnd.github+json"}

    page = 1
    while True:
        params["page"] = str(page)
        # A timeout keeps a stalled connection from hanging the CI job.
        response = requests.get(url, headers=headers, params=params, timeout=30)
        response_json = response.json()
        if response.status_code != 200:
            # Error payloads (rate limit, bad credentials, ...) carry no
            # "items", so report them and stop instead of parsing them.
            print(response_json)
            break

        total = response_json.get("total_count", 0)
        if page == 1:
            print(total)
        parse_result(response_json)

        reachable = min(total, max_results)
        if not response_json.get("items") or page * per_page >= reachable:
            break
        page += 1

    with open("result.json", "w+") as file:
        file.write(json.dumps(result))


def _extract_images(structure, dockerfile):
    """Return the image referenced by every FROM instruction, in order.

    :param structure: iterable of dicts with ``"instruction"`` and
        ``"value"`` keys (the shape this file reads from
        ``DockerfileParser.structure``).
    :param dockerfile: full Dockerfile text, used to look up ARG defaults.
    :return: list of image references; ``${NAME}`` placeholders are replaced
        by the ARG default when one is found, and references to an earlier
        ``AS <stage>`` name are replaced by that stage's image.
    """
    images = []
    alias = {}
    for line in structure:
        if line["instruction"] != "FROM":
            continue
        # split() without an argument collapses repeated whitespace; flags
        # such as --platform=... are dropped so they are not mistaken for
        # the image name.
        values = [v for v in line["value"].split() if not v.startswith("--")]
        if not values:
            continue
        image = values[0]
        # Substitute ${NAME} placeholders with their ARG default values.
        for arg_name in re.findall(r'\$\{(\w+)\}', image):
            arg_value = get_arg_value(arg_name, dockerfile)
            if arg_value:
                image = image.replace(f'${{{arg_name}}}', arg_value)
        # A FROM that names an earlier build stage resolves to that stage's image.
        if image in alias:
            image = alias[image]
        # "FROM <image> AS <stage>": remember the stage name for later lookups.
        if len(values) >= 3 and values[1].lower() == "as":
            alias[values[2]] = image
        images.append(image)
    return images


def parse_result(response_json):
    """Record the base images of every Dockerfile in one search-result page.

    For each entry of ``response_json["items"]`` the file content is fetched
    from the item's ``url``, base64-decoded, parsed with ``DockerfileParser``
    and a dict with the keys ``filename``, ``path``, ``repo_name``,
    ``detail_url`` and ``images`` is appended to the module-level ``result``
    list. Items whose repository name is in ``while_repo_list`` and items
    whose content cannot be fetched are skipped.

    :param response_json: decoded JSON body of a code-search response; a
        payload without ``"items"`` (e.g. an API error) is ignored.
    :return: None
    """
    items = response_json.get("items")
    if not items:
        return

    # Built here because the caller's headers are local to the caller.
    headers = {'Authorization': f'Bearer {ACCESS_TOKEN}', "Accept": "application/vnd.github+json"}

    for item in items:
        res = dict()
        res['filename'] = item["name"]
        res['path'] = item["path"]
        res['repo_name'] = item["repository"]["name"]
        if res['repo_name'] in while_repo_list:
            continue
        url = item["url"]
        res['detail_url'] = url
        print(f"{res['filename']}:{res['path']} - {res['detail_url']}")

        # The search hit only describes the file; its content needs a second request.
        resp_detail = requests.get(url, headers=headers, timeout=30)
        resp_detail_json = resp_detail.json()
        content = resp_detail_json.get("content") if isinstance(resp_detail_json, dict) else None
        if resp_detail.status_code != 200 or content is None:
            # Report the failure and keep going with the remaining items.
            print(resp_detail_json)
            continue

        decoded_bytes = base64.b64decode(content.replace("\n", ""))
        # Undecodable bytes are replaced rather than aborting the whole crawl.
        decoded_string = decoded_bytes.decode('utf-8', errors='replace')

        dfp = DockerfileParser()
        dfp.content = decoded_string
        res['images'] = _extract_images(dfp.structure, decoded_string)
        result.append(res)


def get_arg_value(arg_name, dockerfile):
    """Return the default value declared for build argument *arg_name*.

    Looks for the first ``ARG <arg_name>=<value>`` instruction that starts a
    line of *dockerfile* (leading blanks allowed, instruction keyword matched
    case-insensitively) and returns ``<value>`` stripped of surrounding
    whitespace and of all single and double quotes.

    :param arg_name: name of the build argument (matched literally).
    :param dockerfile: full text of the Dockerfile.
    :return: the default value as a string, or ``None`` when no matching
        ``ARG`` with a non-empty default exists.
    """
    # Anchoring at the start of a line keeps commented-out instructions and
    # longer words that merely end in "ARG" from matching; re.escape makes
    # the name safe to embed in the pattern.
    arg_pattern = rf'^[ \t]*(?i:ARG)\s+{re.escape(arg_name)}=([^\n\r]+)'
    arg_match = re.search(arg_pattern, dockerfile, re.MULTILINE)

    if arg_match is None:
        return None
    return arg_match.group(1).strip().replace("\"", "").replace("'", "")


if __name__ == '__main__':
    # The GitHub token is expected as the first command-line argument; a
    # missing argument is treated the same as an empty one.
    args = sys.argv[1:]
    ACCESS_TOKEN = args[0].strip() if args else ""
    if ACCESS_TOKEN == "":
        print("empty token")
        # Exit non-zero so a run without credentials is reported as failed.
        sys.exit(1)
    get_dockerfile_images()
4 changes: 4 additions & 0 deletions requirement.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
requests
pysocks==1.7.1
urllib3==2.0.4
dockerfile-parse==2.0.1

0 comments on commit 75675c7

Please sign in to comment.