-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
278 additions
and
114 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,17 @@ | ||
name: Collect | ||
|
||
# auto collection | ||
on: | ||
push: | ||
branches: [ "main" ] | ||
# schedule: | ||
# - cron: '0 0 * * *' | ||
workflow_dispatch: | ||
inputs: | ||
date: | ||
description: 'crawler date time' | ||
required: true | ||
token: | ||
description: 'select token' | ||
required: true | ||
schedule: | ||
- cron: '0 15 * * *' | ||
|
||
jobs: | ||
collect-github: | ||
|
@@ -18,9 +25,17 @@ jobs: | |
- name: Prepare | ||
run: pip3 install -r requirement.txt | ||
- name: Crawler | ||
run: python3 main.py ${{ secrets.CUSTOMS_GITHUB_TOKEN }} | ||
run: python3 main.py ${{ secrets.CUSTOMS_GITHUB_TOKEN }} ${{ inputs.date }} ${{ inputs.token }} | ||
- name: Upload Repos | ||
run: | | ||
git config --local user.email "[email protected]" | ||
git config --local user.name "Github Action" | ||
git remote set-url origin https://${{ github.actor }}:$(echo "${{ secrets.CUSTOMS_GITHUB_TOKEN }}" | cut -d ',' -f 1)@github.com/${{ github.repository }} | ||
git pull --rebase | ||
git add . | ||
git commit -m "Auto Update `date --rfc-3339=seconds`" | ||
git push -f | ||
- uses: actions/upload-artifact@v3 | ||
with: | ||
name: result.json | ||
path: result.json | ||
|
||
name: github | ||
path: github-*.json |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
import time | ||
from datetime import datetime, timedelta | ||
|
||
def calculate_execution_time(func):
    """Decorator that prints how long each call to ``func`` takes.

    The wrapped function's return value is passed through unchanged. The
    timing line is printed only when ``func`` returns normally; if it raises,
    the exception propagates and nothing is printed.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func`` and carries its metadata (``__name__``, ``__doc__``, ...).
    """
    # Imported locally so the block does not depend on a module-level import.
    from functools import wraps

    @wraps(func)  # keep the decorated function's name/docstring intact
    def wrapper(*args, **kwargs):
        # perf_counter is a monotonic, high-resolution clock, so the measured
        # duration is not skewed by system clock adjustments.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        execution_time = time.perf_counter() - start_time
        print(f"Function '{func.__name__}' executed in {execution_time:.6f} seconds.")
        return result

    return wrapper
|
||
|
||
def get_previous_day():
    """Return yesterday's calendar date as a ``datetime.date``.

    "Today" is taken from ``datetime.now()``, i.e. the naive local time of
    the machine running the code.
    """
    one_day = timedelta(days=1)
    # Keep only the date part of "now", then step back by one day.
    today = datetime.now().date()
    return today - one_day
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
import base64 | ||
import json | ||
import re | ||
import time | ||
|
||
import requests | ||
|
||
import common.timer | ||
from github.graphQL import GraphQL | ||
from dockerfile_parse import DockerfileParser | ||
|
||
|
||
class GithubCode:
    """Collect the base images referenced by Dockerfiles of GitHub repositories.

    Repositories come from a ``GraphQL`` search. For each repository the REST
    code-search endpoint is queried for Dockerfiles containing ``FROM``; every
    hit is downloaded and its ``FROM`` instructions are parsed.
    """

    def __init__(self, token, date=""):
        """Prepare request headers and the underlying repository search.

        Args:
            token: GitHub token, sent as a bearer token on every REST request
                and handed to ``GraphQL``.
            date: Passed through to ``GraphQL`` unchanged.
        """
        self.url = 'https://api.github.com/search/code'
        self.headers = {'Authorization': f'Bearer {token}', "Accept": "application/vnd.github+json"}
        self.graphQL = GraphQL(token, date)
        # Repositories enriched with a "details" list; filled by fetch_code_detail.
        self.codes = []

    @common.timer.calculate_execution_time
    def fetch_code_detail(self):
        """Search every repository for Dockerfiles and store the parsed result.

        Each repository whose code search succeeds gets a ``"details"`` key
        and is appended to ``self.codes``. A failing search is retried up to
        three times; a repository that still fails is reported and skipped.
        """
        self.graphQL.fetch_all_result()
        total_repos = len(self.graphQL.repos)
        print(self.graphQL.query_time, "total:", total_repos)
        for i, repo in enumerate(self.graphQL.repos):
            response = self.fetch_detail(repo)
            response_json = response.json()
            retry_cnt = 0
            while response.status_code != 200 and retry_cnt < 3:
                if "message" in response_json and "API rate limit" in response_json["message"]:
                    # Rate limit hit: pause for roughly a minute before retrying.
                    print(f"rete limit, now process: {i}/{total_repos}")
                    time.sleep(55)
                response = self.fetch_detail(repo)
                response_json = response.json()
                retry_cnt += 1
            # Decide on the final response itself: the retry counter stops at
            # 3, so comparing it with "> 3" could never detect a failure.
            if response.status_code != 200:
                print("error fetch repo:", repo)
                continue
            total = response_json["total_count"]
            print(repo["nameWithOwner"], total)
            repo["details"] = self.parse_result(response_json)
            self.codes.append(repo)

    def fetch_detail(self, repo):
        """Run the code search for Dockerfiles containing ``FROM`` in ``repo``.

        Args:
            repo: Mapping with at least a ``"nameWithOwner"`` key.

        Returns:
            The ``requests`` response of the search call (first page, up to
            100 hits).
        """
        params = {'q': f'FROM repo:{repo["nameWithOwner"]} language:Dockerfile', "page": "1", "per_page": "100"}
        return requests.get(self.url, headers=self.headers, params=params)

    def parse_result(self, response_json):
        """Download every search hit and extract the images it builds from.

        Args:
            response_json: Decoded code-search response; its ``"items"`` list
                is iterated.

        Returns:
            One dict per hit with the keys ``filename``, ``path``,
            ``repo_name``, ``detail_url`` and ``images``. ``images`` is empty
            when the file content could not be downloaded.
        """
        result = []
        for item in response_json["items"]:
            url = item["url"]
            res = {
                'filename': item["name"],
                'path': item["path"],
                'repo_name': item["repository"]["name"],
                'detail_url': url,
            }
            print(f"{res['filename']}:{res['path']} - {res['detail_url']}")
            # The search hit only carries metadata; the content needs its own request.
            resp_detail = requests.get(url, headers=self.headers)
            resp_detail_json = resp_detail.json()
            if resp_detail.status_code != 200:
                # An error payload has no "content"; keep the hit without
                # images instead of failing on the missing key.
                print(resp_detail_json)
                res['images'] = []
                result.append(res)
                continue
            contents_base64 = resp_detail_json["content"].replace("\n", "")
            decoded_string = base64.b64decode(contents_base64).decode('utf-8')
            res['images'] = self._extract_images(decoded_string)
            result.append(res)
        return result

    @staticmethod
    def _extract_images(dockerfile_text):
        """Return the image of every ``FROM`` instruction, in file order."""
        # NOTE(review): DockerfileParser() without arguments may read/write a
        # Dockerfile in the working directory - confirm against the library docs.
        dfp = DockerfileParser()
        dfp.content = dockerfile_text
        images = []
        # Maps a stage name ("FROM x AS name") to the image it was built from.
        alias = dict()
        for line in dfp.structure:
            if line["instruction"] != "FROM":
                continue
            # split() tolerates repeated whitespace; options such as
            # --platform=... precede the image and must not be taken for it.
            values = [token for token in line["value"].split() if not token.startswith("--")]
            if not values:
                continue
            image = values[0]
            # Substitute ${NAME} placeholders with the ARG default, when one exists.
            for arg_name in re.findall(r'\$\{(\w+)\}', image):
                arg_value = get_arg_value(arg_name, dockerfile_text)
                if arg_value:
                    image = image.replace(f'${{{arg_name}}}', arg_value)
            # A stage built FROM an earlier stage resolves to that stage's image.
            if image in alias:
                image = alias[image]
            if any(token.lower() == "as" for token in values):
                alias[values[-1]] = image
            images.append(image)
        return images

    def save_result(self):
        """Write ``self.codes`` as JSON to ``github-<query_time>.json``."""
        with open(f"github-{self.graphQL.query_time}.json", "w+") as file:
            file.write(json.dumps(self.codes))
|
||
|
||
def get_arg_value(arg_name, dockerfile):
    """Look up the default value of an ``ARG`` instruction in a Dockerfile.

    Only real instructions are considered: ``ARG`` must be the first token on
    its line, so commented-out lines are ignored. The instruction keyword is
    matched case-insensitively, the argument name exactly.

    Args:
        arg_name: Name of the build argument; treated as literal text.
        dockerfile: Full text of the Dockerfile.

    Returns:
        The value of the first matching ``ARG name=value`` line, stripped of
        surrounding whitespace and of all single and double quotes, or
        ``None`` when no such line exists.
    """
    # re.escape keeps regex metacharacters in the name from altering the pattern.
    arg_pattern = rf'^[ \t]*(?i:ARG)[ \t]+{re.escape(arg_name)}=([^\n\r]+)'
    arg_match = re.search(arg_pattern, dockerfile, re.MULTILINE)
    if arg_match is None:
        return None
    return arg_match.group(1).strip().replace("\"", "").replace("'", "")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
import json | ||
import requests | ||
import common.timer | ||
|
||
|
||
class GraphQL:
    """Search GitHub repositories through the GraphQL API.

    The search string is ``language:Dockerfile created:<query_time>``; the
    matching repository nodes are accumulated in ``self.repos``.
    """

    def __init__(self, github_token, date="", batch_size=100):
        """Set up the search.

        Args:
            github_token: Token sent as a bearer token with every query.
            date: Value used for the ``created:`` qualifier. When it is the
                empty string, the previous day is used instead.
            batch_size: Number of repositories requested per page.
        """
        self.count = 0
        self.url = 'https://api.github.com/graphql'
        self.batch_size = batch_size
        self.token = github_token
        self.query_time = date if date != "" else common.timer.get_previous_day()
        self.query = f'''language:Dockerfile created:{self.query_time}'''
        self.repos = []

    def fetch_all_result(self):
        """Store the total hit count and fetch all pages if there are few enough.

        Nothing is fetched when the search reports more than 1000
        repositories; this is reported on stdout rather than passing silently.
        """
        data = self.run_graphql_query(count_query(self.query))
        count = data["data"]['search']['repositoryCount']
        self.count = count
        if count <= 1000:
            self.fetch_results_batch("")
        else:
            print(f"skip fetching: {count} repositories exceed the limit of 1000")

    def fetch_results_batch(self, after):
        """Fetch result pages starting at cursor ``after`` into ``self.repos``.

        Args:
            after: Cursor to start from; the empty string means the first page.

        A failed request for the first page raises. A failed request for a
        later page, or a response of unexpected shape, is printed and ends
        pagination with the repositories gathered so far.
        """
        cursor = after
        is_first_page = True
        while True:
            try:
                data = self.run_graphql_query(query(self.query, self.batch_size, cursor))
            except Exception as err:
                if is_first_page:
                    raise
                # Best effort: keep the pages that were already collected.
                print(f"stop paging after a failed request: {err}")
                return
            is_first_page = False
            try:
                rateLimit = data["data"]["rateLimit"]
                nodes = data["data"]["search"]["nodes"]
                page_info = data["data"]["search"]["pageInfo"]
                has_next_page = page_info["hasNextPage"]
                next_cursor = page_info["endCursor"]
            except (KeyError, TypeError):
                # Unexpected payload (e.g. an error document): show it and stop.
                print(json.dumps(data))
                return
            self.repos.extend(nodes)
            print("rate limit:", rateLimit)
            print("page info:", page_info)
            if not has_next_page:
                return
            cursor = next_cursor

    def run_graphql_query(self, body):
        """POST a GraphQL document and return the decoded JSON response.

        Args:
            body: The GraphQL query text.

        Raises:
            Exception: When the HTTP status code is not 200.
        """
        response = requests.post(self.url, json={'query': body}, headers={'Authorization': f'Bearer {self.token}'})
        if response.status_code != 200:
            raise Exception(f"GraphQL query failed with status code: {response.status_code}\n{response.text}")
        return response.json()
|
||
|
||
def count_query(search_query):
    """Build a GraphQL document that returns only the number of matching repositories.

    Args:
        search_query: GitHub search string (e.g. ``language:Dockerfile ...``).

    Returns:
        The GraphQL query text requesting ``repositoryCount`` for the search.
    """
    # json.dumps emits a quoted string literal with quotes and backslashes
    # escaped, so a search string containing '"' cannot break the document.
    quoted_search = json.dumps(search_query)
    return f'''
    query{{
        search(query: {quoted_search}, type: REPOSITORY, first: 1) {{
            repositoryCount
        }}
    }}
    '''
|
||
|
||
def query(search_query, first, after):
    """Build the GraphQL document for one page of the repository search.

    Args:
        search_query: GitHub search string.
        first: Number of repositories to request for this page.
        after: Cursor of the previous page; the empty string requests the
            first page and omits the ``after`` argument.

    Returns:
        The GraphQL query text. It asks for the rate-limit status, the
        repository nodes and the page info (``endCursor``, ``hasNextPage``).
    """
    # json.dumps emits quoted, escaped string literals, so neither the search
    # string nor the cursor can break out of the document.
    param = f'query: {json.dumps(search_query)}, type: REPOSITORY, first: {first}'
    if after != "":
        param += f', after: {json.dumps(after)}'
    return f'''
    query {{
        rateLimit {{
            limit
            cost
            remaining
            used
            resetAt
            nodeCount
        }}
        search({param}) {{
            nodes {{
                ... on Repository {{
                    nameWithOwner
                    description
                    url
                    createdAt
                    stargazerCount
                    forkCount
                }}
            }}
            pageInfo {{
                endCursor
                hasNextPage
            }}
        }}
    }}
    '''
Oops, something went wrong.