diff --git a/.github/workflows/collect.yml b/.github/workflows/collect.yml
index 50f9936..68df498 100644
--- a/.github/workflows/collect.yml
+++ b/.github/workflows/collect.yml
@@ -1,10 +1,17 @@
 name: Collect
+# auto collection
 on:
-  push:
-    branches: [ "main" ]
-#  schedule:
-#    - cron: '0 0 * * *'
+  workflow_dispatch:
+    inputs:
+      date:
+        description: 'crawler date time'
+        required: true
+      token:
+        description: 'select token'
+        required: true
+  schedule:
+    - cron: '0 15 * * *'
 
 jobs:
   collect-github:
     runs-on: ubuntu-latest
@@ -18,9 +25,17 @@ jobs:
     - name: Prepare
       run: pip3 install -r requirement.txt
     - name: Crawler
-      run: python3 main.py ${{ secrets.CUSTOMS_GITHUB_TOKEN }}
+      run: python3 main.py ${{ secrets.CUSTOMS_GITHUB_TOKEN }} ${{ inputs.date }} ${{ inputs.token }}
+    - name: Upload Repos
+      run: |
+        git config --local user.email "action@github.com"
+        git config --local user.name "Github Action"
+        git remote set-url origin https://${{ github.actor }}:$(echo "${{ secrets.CUSTOMS_GITHUB_TOKEN }}" | cut -d ',' -f 1)@github.com/${{ github.repository }}
+        git pull --rebase
+        git add .
+        git commit -m "Auto Update `date --rfc-3339=seconds`" || echo "nothing to commit"
+        git push -f
     - uses: actions/upload-artifact@v3
       with:
-        name: result.json
-        path: result.json
-
+        name: github
+        path: github-*.json
diff --git a/.gitignore b/.gitignore
index 2dc53ca..a998606 100644
--- a/.gitignore
+++ b/.gitignore
@@ -158,3 +158,4 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 .idea/
+Dockerfile
\ No newline at end of file
diff --git a/README.md b/README.md
index 2650e6a..99fce4d 100644
--- a/README.md
+++ b/README.md
@@ -2,3 +2,14 @@
 
 镜像海关 - Github Action自动化 Docker 镜像数据收集与分析,为云安全研究提供数据支持。
 
+基于`GitHub GraphQL API v4` ,获取存在`Dockerfile`的前一天创建的仓库,通过`REST API`去定向搜索具体的`code`,获取所有`FROM`字段
+
+依此来逃过`search code API`最多只能获取到1000个的限制。
+
+## 一些存在的问题:
+
+很多repo没有indexed,导致搜索时报错:
+
+`This repository's code is being indexed right now. Try again in a few minutes.`
+
+search接口的limit为 10次/分钟,只能依照这个速率进行爬取。
\ No newline at end of file
diff --git a/common/__init__.py b/common/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/common/timer.py b/common/timer.py
new file mode 100644
index 0000000..87dfeca
--- /dev/null
+++ b/common/timer.py
@@ -0,0 +1,22 @@
+import time
+from datetime import datetime, timedelta
+
+def calculate_execution_time(func):
+    def wrapper(*args, **kwargs):
+        start_time = time.time()
+        result = func(*args, **kwargs)
+        end_time = time.time()
+        execution_time = end_time - start_time
+        print(f"Function '{func.__name__}' executed in {execution_time:.6f} seconds.")
+        return result
+
+    return wrapper
+
+
+def get_previous_day():
+    current_datetime = datetime.now()
+    # extract the date part
+    current_date = current_datetime.date()
+    # subtract one day with timedelta
+    previous_day = current_date - timedelta(days=1)
+    return previous_day
diff --git a/github/__init__.py b/github/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/github/github.py b/github/github.py
new file mode 100644
index 0000000..4effa21
--- /dev/null
+++ b/github/github.py
@@ -0,0 +1,110 @@
+import base64
+import json
+import re
+import time
+
+import requests
+
+import common.timer
+from github.graphQL import GraphQL
+from dockerfile_parse import DockerfileParser
+
+
+class GithubCode:
+    def __init__(self, token, date=""):
+        self.url = 'https://api.github.com/search/code'
+        self.headers = {'Authorization': f'Bearer {token}', "Accept": "application/vnd.github+json"}
+        self.graphQL = GraphQL(token, date)
+        self.codes = []
+
+    @common.timer.calculate_execution_time
+    def fetch_code_detail(self):
+        self.graphQL.fetch_all_result()
+        total_repos = len(self.graphQL.repos)
+        print(self.graphQL.query_time, "total:", total_repos)
+        for i in range(0, total_repos):
+            repo = self.graphQL.repos[i]
+            response = self.fetch_detail(repo)
+            response_json = response.json()
+            retry_cnt = 0
+            while response.status_code != 200 and retry_cnt < 3:
+                if "message" in response_json and "API rate limit" in response_json["message"]:
+                    # hit the search API rate limit; back off for ~50s
+                    print(f"rate limit, now process: {i}/{total_repos}")
+                    time.sleep(55)
+                response = self.fetch_detail(repo)
+                response_json = response.json()
+                retry_cnt += 1
+            # still failing after all retries: skip this repo instead of crashing below
+            if response.status_code != 200:
+                print("error fetch repo:", repo)
+                continue
+            total = response_json["total_count"]
+            print(repo["nameWithOwner"], total)
+            repo["details"] = self.parse_result(response_json)
+            self.codes.append(repo)
+
+    def fetch_detail(self, repo):
+        params = {'q': f'FROM repo:{repo["nameWithOwner"]} language:Dockerfile', "page": "1", "per_page": "100"}
+        response = requests.get(self.url, headers=self.headers, params=params)
+        return response
+
+    def parse_result(self, response_json):
+        result = []
+        for item in response_json["items"]:
+            res = dict()
+            res['filename'] = item["name"]
+            res['path'] = item["path"]
+            res['repo_name'] = item["repository"]["name"]
+            url = item["url"]
+            res['detail_url'] = url
+            print(f"{res['filename']}:{res['path']} - {res['detail_url']}")
+            images = []
+            # need get
+            resp_detail = requests.get(url, headers=self.headers)
+            resp_detail_json = resp_detail.json()
+            if resp_detail.status_code != 200:
+                print(resp_detail_json)
+                continue
+            contents_base64 = resp_detail_json["content"].replace("\n", "")
+            decoded_bytes = base64.b64decode(contents_base64)
+            decoded_string = decoded_bytes.decode('utf-8')
+            dfp = DockerfileParser()
+            dfp.content = decoded_string
+            alias = dict()
+            for line in dfp.structure:
+                if line["instruction"] == "FROM":
+                    values = line["value"].split(" ")
+                    image = values[0]
+                    # find replace value:
+                    arg_pattern = r'\$\{(\w+)\}'
+                    arg_matches = re.findall(arg_pattern, image)
+                    for arg_name in arg_matches:
+                        arg_value = get_arg_value(arg_name, decoded_string)
+                        if arg_value:
+                            image = image.replace(f'${{{arg_name}}}', arg_value)
+                    # transfer alias
+                    if image in alias:
+                        image = alias[image]
+                    for v in values:
+                        if v.lower() == "as":
+                            alias[values[-1]] = image
+
+                    images.append(image)
+            res['images'] = images
+            result.append(res)
+        return result
+
+    def save_result(self):
+        with open(f"github-{self.graphQL.query_time}.json", "w+") as file:
+            file.write(json.dumps(self.codes))
+
+
+def get_arg_value(arg_name, dockerfile):
+    arg_pattern = rf'ARG\s+{arg_name}=([^\n\r]+)'
+    arg_match = re.search(arg_pattern, dockerfile)
+
+    if arg_match:
+        return arg_match.group(1).strip().replace("\"", "").replace("'", "")
+    else:
+        return None
diff --git a/github/graphQL.py b/github/graphQL.py
new file mode 100644
index 0000000..b653fcd
--- /dev/null
+++ b/github/graphQL.py
@@ -0,0 +1,91 @@
+import json
+import requests
+import common.timer
+
+
+class GraphQL:
+    def __init__(self, github_token, date="", batch_size=100):
+        self.count = 0
+        self.url = 'https://api.github.com/graphql'
+        self.batch_size = batch_size
+        self.token = github_token
+        self.query_time = common.timer.get_previous_day()
+        if date != "":
+            self.query_time = date
+        self.query = f'''language:Dockerfile created:{self.query_time}'''
+        self.repos = []
+
+    def fetch_all_result(self):
+        data = self.run_graphql_query(count_query(self.query))
+        count = data["data"]['search']['repositoryCount']
+        self.count = count
+        if count <= 1000:
+            self.fetch_results_batch("")
+        else:
+            print(f"too many repos ({count} > 1000), search results would be truncated")
+
+    def fetch_results_batch(self, after):
+        data = self.run_graphql_query(query(self.query, self.batch_size, after))
+        try:
+            rateLimit = data["data"]["rateLimit"]
+            nodes = data["data"]["search"]["nodes"]
+            page_info = data["data"]["search"]["pageInfo"]
+            self.repos.extend(nodes)
+
+            print("rate limit:", rateLimit)
+            print("page info:", page_info)
+            if page_info["hasNextPage"]:
+                self.fetch_results_batch(page_info["endCursor"])
+        except Exception:
+            print(json.dumps(data))
+
+    def run_graphql_query(self, body):
+        response = requests.post(self.url, json={'query': body}, headers={'Authorization': f'Bearer {self.token}'})
+        if response.status_code == 200:
+            return response.json()
+        else:
+            raise Exception(f"GraphQL query failed with status code: {response.status_code}\n{response.text}")
+
+
+def count_query(search_query):
+    return f'''
+    query{{
+        search(query: "{search_query}", type: REPOSITORY, first: 1) {{
+            repositoryCount
+        }}
+    }}
+    '''
+
+
+def query(search_query, first, after):
+    param = f'''query: "{search_query}", type: REPOSITORY,first: {first}, after:"{after}" '''
+    if after == "":
+        param = f'''query: "{search_query}", type: REPOSITORY, first: {first}'''
+    return f'''
+    query {{
+      rateLimit {{
+        limit
+        cost
+        remaining
+        used
+        resetAt
+        nodeCount
+      }}
+      search({param}) {{
+        nodes {{
+          ... on Repository {{
+            nameWithOwner
+            description
+            url
+            createdAt
+            stargazerCount
+            forkCount
+          }}
+        }}
+        pageInfo {{
+          endCursor
+          hasNextPage
+        }}
+      }}
+    }}
+'''
diff --git a/main.py b/main.py
index 479c39a..cd11427 100644
--- a/main.py
+++ b/main.py
@@ -1,114 +1,32 @@
-import base64
-import json
-import re
 import sys
-import time
-
-import requests
-from dockerfile_parse import DockerfileParser
-
-# 替换成你的GitHub Personal Access Token
-ACCESS_TOKEN = ''
-
-result = []
-while_repo_list = ["docker_repair"]
-
-
-def calculate_execution_time(func):
-    def wrapper(*args, **kwargs):
-        start_time = time.time()
-        result = func(*args, **kwargs)
-        end_time = time.time()
-        execution_time = end_time - start_time
-        print(f"Function '{func.__name__}' executed in {execution_time:.6f} seconds.")
-        return result
-
-    return wrapper
-
-
-@calculate_execution_time
-def get_dockerfile_images():
-    url = 'https://api.github.com/search/code'
-    params = {'q': 'FROM language:Dockerfile', "page": "1", "per_page": "100"}
-    headers = {'Authorization': f'Bearer {ACCESS_TOKEN}', "Accept": "application/vnd.github+json"}
-
-    response = requests.get(url, headers=headers, params=params)
-    response_json = response.json()
-    if response.status_code != 200:
-        print(response.json())
-    parse_result(response_json)
-    total = response_json["total_count"]
-    print(total)
-    for page in range(2, 100):
-        params["page"] = str(page)
-        response = requests.get(url, headers=headers, params=params)
-        if response.status_code != 200:
-            print(response.json())
-        response_json = response.json()
-        parse_result(response_json)
-
-    with open("result.json", "w+") as file:
-        file.write(json.dumps(result))
-
-
-def parse_result(response_json):
-    for item in response_json["items"]:
-        res = dict()
-        res['filename'] = item["name"]
-        res['path'] = item["path"]
-        res['repo_name'] = item["repository"]["name"]
-        if res['repo_name'] in while_repo_list:
-            continue
-        url = item["url"]
-        res['detail_url'] = url
-        print(f"{res['filename']}:{res['path']} - {res['detail_url']}")
-        images = []
-        # need get
-        resp_detail = requests.get(url, headers=headers)
-        resp_detail_json = resp_detail.json()
-        contents_base64 = resp_detail_json["content"].replace("\n", "")
-        decoded_bytes = base64.b64decode(contents_base64)
-        decoded_string = decoded_bytes.decode('utf-8')
-        dfp = DockerfileParser()
-        dfp.content = decoded_string
-        alias = dict()
-        for line in dfp.structure:
-            if line["instruction"] == "FROM":
-                values = line["value"].split(" ")
-                image = values[0]
-                # find replace value:
-                arg_pattern = r'\$\{(\w+)\}'
-                arg_matches = re.findall(arg_pattern, image)
-                for arg_name in arg_matches:
-                    arg_value = get_arg_value(arg_name, decoded_string)
-                    if arg_value:
-                        image = image.replace(f'${{{arg_name}}}', arg_value)
-                # transfer alias
-                if image in alias:
-                    image = alias[image]
-                for v in values:
-                    if v.lower() == "as":
-                        alias[values[-1]] = image
-
-                images.append(image)
-        res['images'] = images
-        result.append(res)
-
-
-def get_arg_value(arg_name, dockerfile):
-    arg_pattern = rf'ARG\s+{arg_name}=([^\n\r]+)'
-    arg_match = re.search(arg_pattern, dockerfile)
-
-    if arg_match:
-        return arg_match.group(1).strip().replace("\"", "").replace("'", "")
-    else:
-        return None
-
-
-if __name__ == '__main__':
-    args = sys.argv[1:]
-    ACCESS_TOKEN = args[0]
-    if ACCESS_TOKEN == "":
-        print("empty token")
-        sys.exit()
-    get_dockerfile_images()
+
+from github.github import GithubCode
+
+
+if __name__ == '__main__':
+    args = sys.argv[1:]
+    if len(args) < 1:
+        print("empty arg")
+        sys.exit()
+
+    token = args[0].split(",")
+    date = ""
+    token_select = 0
+    if len(args) == 2:
+        date = args[1]
+    if len(args) == 3:
+        date = args[1]
+        token_select = args[2]
+
+    if len(token) == 0:
+        print("empty token")
+        sys.exit()
+
+    if int(token_select) > len(token) - 1:
+        print("token select error")
+        sys.exit()
+    t = token[int(token_select)]
+    print("select token ", t)
+    github = GithubCode(t, date)
+    github.fetch_code_detail()
+    github.save_result()