Skip to content

Commit

Permalink
init
Browse files Browse the repository at this point in the history
  • Loading branch information
DVKunion committed Jul 31, 2023
1 parent fa7a527 commit 8b932ff
Show file tree
Hide file tree
Showing 9 changed files with 278 additions and 114 deletions.
31 changes: 23 additions & 8 deletions .github/workflows/collect.yml
Original file line number Diff line number Diff line change
@@ -1,10 +1,17 @@
name: Collect

# auto collection
on:
push:
branches: [ "main" ]
# schedule:
# - cron: '0 0 * * *'
workflow_dispatch:
inputs:
date:
description: 'crawler date time'
required: true
token:
description: 'select token'
required: true
schedule:
- cron: '0 15 * * *'

jobs:
collect-github:
Expand All @@ -18,9 +25,17 @@ jobs:
- name: Prepare
run: pip3 install -r requirement.txt
- name: Crawler
run: python3 main.py ${{ secrets.CUSTOMS_GITHUB_TOKEN }}
run: python3 main.py ${{ secrets.CUSTOMS_GITHUB_TOKEN }} ${{ inputs.date }} ${{ inputs.token }}
- name: Upload Repos
run: |
git config --local user.email "[email protected]"
git config --local user.name "Github Action"
git remote set-url origin https://${{ github.actor }}:${{ secrets.CUSTOMS_GITHUB_TOKEN }}@github.com/${{ github.repository }}
git pull --rebase
git add .
git commit -m "Auto Update `date --rfc-3339=seconds`"
git push -f
- uses: actions/upload-artifact@v3
with:
name: result.json
path: result.json

name: github
path: github-*.json
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,4 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
Dockerfile
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,14 @@

镜像海关 - Github Action自动化 Docker 镜像数据收集与分析,为云安全研究提供数据支持。

基于`GitHub GraphQL API v4` ,获取存在`Dockerfile`的前一天创建的仓库,通过`REST API`去定向搜索具体的`code`,获取所有`FROM`字段

以此来绕过`search code API`最多只能获取到1000个结果的限制。

## 一些存在的问题:

很多repo没有indexed,导致搜索时报错:

`This repository's code is being indexed right now. Try again in a few minutes.`

search接口的limit为 10次/分钟,只能依照这个速率进行爬取。
Empty file added common/__init__.py
Empty file.
22 changes: 22 additions & 0 deletions common/timer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import time
from datetime import datetime, timedelta

def calculate_execution_time(func):
    """Decorator that prints how long each call to *func* takes.

    Returns a wrapper with the same signature that forwards all arguments,
    times the call with ``time.time()``, prints the elapsed seconds, and
    returns the wrapped function's result unchanged.
    """
    # Local import keeps this block self-contained without touching the
    # module's import header.
    from functools import wraps

    # BUG FIX: without functools.wraps the wrapper hid the decorated
    # function's __name__/__doc__, which this decorator itself relies on
    # for its log line.
    @wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        execution_time = time.time() - start_time
        print(f"Function '{func.__name__}' executed in {execution_time:.6f} seconds.")
        return result

    return wrapper


def get_previous_day():
    """Return yesterday's calendar date as a ``datetime.date``."""
    today = datetime.now().date()
    return today - timedelta(days=1)
Empty file added github/__init__.py
Empty file.
108 changes: 108 additions & 0 deletions github/github.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import base64
import json
import re
import time

import requests

import common.timer
from github.graphQL import GraphQL
from dockerfile_parse import DockerfileParser


class GithubCode:
    """Search GitHub code for the Dockerfiles of the repos found via GraphQL
    and extract every base image referenced by their FROM instructions.

    Results accumulate in ``self.codes`` (one enriched repo record each) and
    can be persisted with :meth:`save_result`.
    """

    def __init__(self, token, date=""):
        # REST "search code" endpoint; auth is a bearer token.
        self.url = 'https://api.github.com/search/code'
        self.headers = {'Authorization': f'Bearer {token}', "Accept": "application/vnd.github+json"}
        self.graphQL = GraphQL(token, date)
        # Repo records from GraphQL, each extended with a "details" key.
        self.codes = []

    @common.timer.calculate_execution_time
    def fetch_code_detail(self):
        """Run a Dockerfile code search for every repo returned by GraphQL."""
        self.graphQL.fetch_all_result()
        total_repos = len(self.graphQL.repos)
        print(self.graphQL.query_time, "total:", total_repos)
        for i in range(0, total_repos):
            repo = self.graphQL.repos[i]
            response = self.fetch_detail(repo)
            response_json = response.json()
            retry_cnt = 0
            # The search endpoint allows ~10 requests/minute; back off and retry.
            while response.status_code != 200 and retry_cnt < 3:
                if "message" in response_json and "API rate limit" in response_json["message"]:
                    # Rate limited: sleep ~55s before retrying.
                    print(f"rate limit, now process: {i}/{total_repos}")
                    time.sleep(55)
                response = self.fetch_detail(repo)
                response_json = response.json()
                retry_cnt += 1
            # BUG FIX: the original checked `retry_cnt > 3`, which the loop
            # above makes impossible (it stops at 3), so exhausted repos fell
            # through and raised KeyError on "total_count" below. Checking the
            # final status code covers every failure mode.
            if response.status_code != 200:
                print("error fetch repo:", repo)
                continue
            total = response_json["total_count"]
            print(repo["nameWithOwner"], total)
            repo["details"] = self.parse_result(response_json)
            self.codes.append(repo)

    def fetch_detail(self, repo):
        """Issue one code-search request for FROM lines in *repo*'s Dockerfiles."""
        params = {'q': f'FROM repo:{repo["nameWithOwner"]} language:Dockerfile', "page": "1", "per_page": "100"}
        response = requests.get(self.url, headers=self.headers, params=params)
        return response

    def parse_result(self, response_json):
        """Download each search hit's Dockerfile and collect its base images.

        Resolves ``${ARG}`` placeholders via :func:`get_arg_value` and maps
        multi-stage build aliases (``FROM x AS y``) back to the real image.
        Returns a list of dicts with filename/path/repo_name/detail_url/images.
        """
        result = []
        for item in response_json["items"]:
            res = dict()
            res['filename'] = item["name"]
            res['path'] = item["path"]
            res['repo_name'] = item["repository"]["name"]
            url = item["url"]
            res['detail_url'] = url
            print(f"{res['filename']}:{res['path']} - {res['detail_url']}")
            images = []
            # Fetch the file contents (base64-encoded by the contents API).
            resp_detail = requests.get(url, headers=self.headers)
            resp_detail_json = resp_detail.json()
            if resp_detail.status_code != 200:
                # BUG FIX: the original only printed the error body and then
                # read resp_detail_json["content"] anyway, raising KeyError
                # (e.g. for repos still being indexed). Skip the item instead.
                print(resp_detail_json)
                continue
            contents_base64 = resp_detail_json["content"].replace("\n", "")
            decoded_bytes = base64.b64decode(contents_base64)
            decoded_string = decoded_bytes.decode('utf-8')
            dfp = DockerfileParser()
            dfp.content = decoded_string
            # Maps build-stage alias -> resolved base image for later stages.
            alias = dict()
            for line in dfp.structure:
                if line["instruction"] == "FROM":
                    values = line["value"].split(" ")
                    image = values[0]
                    # Substitute ${ARG} placeholders with their declared defaults.
                    arg_pattern = r'\$\{(\w+)\}'
                    arg_matches = re.findall(arg_pattern, image)
                    for arg_name in arg_matches:
                        arg_value = get_arg_value(arg_name, decoded_string)
                        if arg_value:
                            image = image.replace(f'${{{arg_name}}}', arg_value)
                    # "FROM <alias>" of an earlier stage -> real image.
                    if image in alias:
                        image = alias[image]
                    # Record "FROM <image> AS <alias>" for later stages.
                    for v in values:
                        if v.lower() == "as":
                            alias[values[-1]] = image

                    images.append(image)
            res['images'] = images
            result.append(res)
        return result

    def save_result(self):
        """Dump the collected records to ``github-<query_date>.json``."""
        with open(f"github-{self.graphQL.query_time}.json", "w+") as file:
            file.write(json.dumps(self.codes))


def get_arg_value(arg_name, dockerfile):
    """Return the default assigned by ``ARG <arg_name>=...`` in *dockerfile*.

    The value is stripped of surrounding whitespace and of any double or
    single quote characters. Returns ``None`` when no such ARG line exists.
    """
    match = re.search(rf'ARG\s+{arg_name}=([^\n\r]+)', dockerfile)
    if match is None:
        return None
    value = match.group(1).strip()
    for quote in ('"', "'"):
        value = value.replace(quote, "")
    return value
89 changes: 89 additions & 0 deletions github/graphQL.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import json
import requests
import common.timer


class GraphQL:
    """Client for the GitHub GraphQL v4 search API.

    Pages through all repositories matching
    ``language:Dockerfile created:<date>`` (yesterday by default) and
    accumulates the repository nodes in ``self.repos``.
    """

    def __init__(self, github_token, date="", batch_size=100):
        # Total repositoryCount reported by the search; set by fetch_all_result.
        self.count = 0
        self.url = 'https://api.github.com/graphql'
        self.batch_size = batch_size
        self.token = github_token
        # Default to yesterday; an explicit date string overrides it.
        self.query_time = common.timer.get_previous_day()
        if date != "":
            self.query_time = date
        self.query = f'''language:Dockerfile created:{self.query_time}'''
        self.repos = []

    def fetch_all_result(self):
        """Fetch every matching repo; the search API caps results at 1000."""
        data = self.run_graphql_query(count_query(self.query))
        count = data["data"]['search']['repositoryCount']
        self.count = count
        if count <= 1000:
            self.fetch_results_batch("")
        else:
            # IMPROVEMENT: the original silently fetched nothing here, leaving
            # self.repos empty with no explanation.
            print(f"repository count {count} exceeds the 1000-result search cap; skipping fetch")

    def fetch_results_batch(self, after):
        """Fetch one page starting at cursor *after*; recurse while hasNextPage."""
        data = self.run_graphql_query(query(self.query, self.batch_size, after))
        try:
            rate_limit = data["data"]["rateLimit"]
            nodes = data["data"]["search"]["nodes"]
            page_info = data["data"]["search"]["pageInfo"]
        except (KeyError, TypeError):
            # BUG FIX: this was a bare `except:` wrapped around the whole body,
            # which also swallowed any error raised by the recursive call below.
            # Only malformed payloads are handled now; dump them for debugging.
            print(json.dumps(data))
            return
        self.repos.extend(nodes)

        print("rate limit:", rate_limit)
        print("page info:", page_info)
        if page_info["hasNextPage"]:
            self.fetch_results_batch(page_info["endCursor"])

    def run_graphql_query(self, body):
        """POST the GraphQL *body*; return parsed JSON or raise on non-200."""
        response = requests.post(self.url, json={'query': body}, headers={'Authorization': f'Bearer {self.token}'})
        if response.status_code == 200:
            return response.json()
        else:
            raise Exception(f"GraphQL query failed with status code: {response.status_code}\n{response.text}")


def count_query(search_query):
    """Build a GraphQL query string returning only the repositoryCount
    matching *search_query* (used to check the 1000-result search cap)."""
    return f'''
    query{{
      search(query: "{search_query}", type: REPOSITORY, first: 1) {{
        repositoryCount
      }}
    }}
    '''


def query(search_query, first, after):
    """Build a paged GraphQL search query string.

    Requests up to *first* repository nodes (name, description, url,
    createdAt, star/fork counts) plus rateLimit stats and pageInfo for
    cursor-based pagination. When *after* is "" the cursor clause is
    omitted, which fetches the first page.
    """
    param = f'''query: "{search_query}", type: REPOSITORY,first: {first}, after:"{after}" '''
    if after == "":
        # First page: no "after" cursor.
        param = f'''query: "{search_query}", type: REPOSITORY, first: {first}'''
    return f'''
    query {{
      rateLimit {{
        limit
        cost
        remaining
        used
        resetAt
        nodeCount
      }}
      search({param}) {{
        nodes {{
          ... on Repository {{
            nameWithOwner
            description
            url
            createdAt
            stargazerCount
            forkCount
          }}
        }}
        pageInfo {{
          endCursor
          hasNextPage
        }}
      }}
    }}
    '''
Loading

0 comments on commit 8b932ff

Please sign in to comment.