1 year ago · 5fd106c9f5
--- a/.gitignore
+++ b/.gitignore
@@ -157,4 +157,4 @@ cython_debug/
 
															 #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
														
 
															 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
														
 
															 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
														
 
															-#.idea/
														
 
															+.idea/
														
--- a/info-spider/README.md
+++ b/info-spider/README.md
@@ -0,0 +1,56 @@
 
															+# info-spider
														
 
															+
														
 
															+一个用于调用github官方api以获取社区仓库信息的脚本
														
 
															+
														
 
															+##  使用方式
														
 
															+
														
 
															+本脚本可以直接运行，也可以作为模块被导入
														
 
															+
														
 
															+### Requirements
														
 
															+
														
 
															+需要有python运行环境并安装依赖库
														
 
															+
														
 
															+```shell
														
 
															+pip3 install -r requirements.txt
														
 
															+```
														
 
															+
														
 
															+### **配置说明**
														
 
															+
														
 
															+* 若需更改输出文件及配置文件的路径，可修改脚本中PATH变量的值；若为空，则使用运行脚本时的当前工作目录
														
 
															+
														
 
															+```python
														
 
															+PATH = " 这里填写文件输出以及配置文件路径 "
														
 
															+```
														
 
															+
														
 
															+* 使用者需要在config.json中填写如下选项以更好的使用脚本
														
 
															+  * **user** : 社区用户名，默认为DragonOS-Community
														
 
															+  * **token** : 使用者的[github token](https://github.com/settings/tokens)，用以增加访问次数(若不使用token则有每小时60次的访问限制，[查看详情](https://docs.github.com/zh/rest/overview/rate-limits-for-the-rest-api))
														
 
															+  * **parallel_threads** : 最大并行线程数
														
 
															+  * **black_list** : 仓库获取黑名单，列表中填写仓库的名称用于忽略该仓库中的contributor信息
														
 
															+  * **white_list** : 黑名单的例外名单，列表中填写用户名；对于黑名单中的仓库，只统计白名单内的contributor，其余contributor会被忽略
														
 
															+
														
 
															+### 直接运行
														
 
															+
														
 
															+使用命令行执行脚本生成.json文件以及.xls文件
														
 
															+
														
 
															+```shell
														
 
															+python main.py
														
 
															+```
														
 
															+
														
 
															+### 作为模块导入
														
 
															+
														
 
															+可以调用模块中的get_json()和get_dict()
														
 
															+
														
 
															+* **get_dict()** : 返回带有社区信息的python字典
														
 
															+* **get_json()** : 返回带有社区信息的json文本
														
 
															+
														
 
															+## 添加统计条目
														
 
															+
														
 
															+如果后期需要添加社区仓库的统计条目，需要做以下改动
														
 
															+
														
 
															+1. 编写统计函数，参数为仓库信息字典，返回值为字典{"条目名称":条目数据}，并在脚本头部的**function_list**中填写函数名
														
 
															+2. 将上述条目名称填写到脚本头部的head1中，作为最终输出在excel中的表头
														
 
															+
														
 
															+## TODO
														
 
															+
														
 
															+如果后期需要可以考虑进一步封装脚本
														
--- a/info-spider/config.json
+++ b/info-spider/config.json
@@ -0,0 +1,7 @@
 
															+{
														
 
															+  "user": "DragonOS-Community",
														
 
															+  "token": "",
														
 
															+  "parallel_threads":8,
														
 
															+  "black_list": ["grub","tar","gcc","acpi-rs","mini-backtrace","binutils","mpc","mpfr","gmp-6.2.1","flex","relibc"],
														
 
															+  "white_list": ["fslongjin"]
														
 
															+}
														
--- a/info-spider/main.py
+++ b/info-spider/main.py
@@ -0,0 +1,223 @@
 
															+# -*- coding: UTF-8 -*-
														
 
															+import time
														
 
															+import requests
														
 
															+import json
														
 
															+from retry import retry
														
 
															+import xlwt
														
 
															+from os import path
														
 
															+from sys import stdout
														
 
															+from concurrent.futures import ThreadPoolExecutor
														
 
															+
														
 
															+__all__ = ["get_dict", "get_json"]
														
 
															+
														
 
															+function_list = ["get_cnt", "get_pr", "get_contributors"]  # 信息获取函数
														
 
															+PATH = ""  # 文件输出路径以及配置文件存储路径，为空则默认在脚本文件同一目录下`
														
 
															+head1 = ["name", "starred", "watching", "fork", "issue", "pull_request", "contributor"]  # 表头
														
 
															+head2 = ["name", "contributions"]
														
 
															+
														
 
															+# 配置文件读取
														
 
															+try:
														
 
															+    with open(path.join(PATH, "config.json"), "r", encoding="utf-8") as f:
														
 
															+        # 配置文件选项说明
														
 
															+        dic = json.loads(f.read())
														
 
															+        USER = dic["user"]  # 目标用户
														
 
															+        TOKEN = dic["token"]  # github访问令牌，用于增加api访问次数
														
 
															+        PARALLEL = dic["parallel_threads"]  # 最并行线程数
														
 
															+        BLACKLIST = dic["black_list"]  # contributor获取的仓库黑名单
														
 
															+        WHITELIST = dic["white_list"]  # 仓库黑名单中的contributor白名单
														
 
															+
														
 
															+    pool = ThreadPoolExecutor(max_workers=PARALLEL)
														
 
															+except Exception as e:
														
 
															+    print("There are some errors while getting configure information!\n")
														
 
															+    raise e
														
 
															+
														
 
															+
														
 
															+@retry(Exception, 5, 2, 8)
														
 
															+def get_info(url):
														
 
															+    """
														
 
															+    :param url:请求的api链接
														
 
															+    :return: py字典
														
 
															+    """
														
 
															+    headers = {"Authorization": "Bearer " + TOKEN}
														
 
															+    response = requests.get(url=url, headers=headers).text
														
 
															+    return json.loads(response)
														
 
															+
														
 
															+
														
 
															+def get_repo(repo_dict):
														
 
															+    """
														
 
															+    :param repo_dict:仓库字典
														
 
															+    :return: py字典
														
 
															+    """
														
 
															+    result = {"name": str(repo_dict.get("name")), "description": repo_dict.get("description")}
														
 
															+    for fuc in function_list:
														
 
															+        result.update(eval("%s(repo_dict)" % (fuc)))
														
 
															+    return result
														
 
															+
														
 
															+
														
 
															+def get_cnt(repo_dict):
														
 
															+    result = {
														
 
															+        "starred": repo_dict.get("stargazers_count"),
														
 
															+        "watching": repo_dict.get("watchers_count"),
														
 
															+        "fork": repo_dict.get("forks_count"),
														
 
															+        "issue": repo_dict.get("open_issues_count"),
														
 
															+    }
														
 
															+    return result
														
 
															+
														
 
															+
														
 
															+def get_pr(repo_dict):
														
 
															+    pr_dict = get_info(r"https://api.github.com/repos/" + repo_dict["full_name"] + "/pulls")
														
 
															+    return {"pull_request": len(pr_dict)}
														
 
															+
														
 
															+
														
 
															+def get_contributors(repo_dict):
														
 
															+    result = {"contributor_list": []}
														
 
															+    contri_dict = get_info(repo_dict["contributors_url"])
														
 
															+    for dic in contri_dict:
														
 
															+        # 黑白名单实现
														
 
															+        if repo_dict["name"] in BLACKLIST or repo_dict.get("parent"):
														
 
															+            if dic["login"] not in WHITELIST:
														
 
															+                continue
														
 
															+        tmp = {
														
 
															+            "name": dic["login"],
														
 
															+            "id": dic["id"],
														
 
															+            "contributions": dic["contributions"]
														
 
															+        }
														
 
															+        result["contributor_list"].append(tmp)
														
 
															+    result["contributor"] = len(result["contributor_list"])
														
 
															+    return result
														
 
															+
														
 
															+
														
 
															+def sum_up(dic):
														
 
															+    contribute_existed = {}
														
 
															+    result = {"total": {
														
 
															+        "starred": 0,
														
 
															+        "watching": 0,
														
 
															+        "fork": 0,
														
 
															+        "issue": 0,
														
 
															+        "pull_request": 0,
														
 
															+        "contributor": 0,
														
 
															+        "contributor_list": []
														
 
															+    }}
														
 
															+    pos = 0
														
 
															+    for repo in dic["repositories"]:
														
 
															+        for k in result["total"].keys():
														
 
															+            if k != "contributor_list":
														
 
															+                result["total"][k] += repo[k]
														
 
															+            else:
														
 
															+                # contributor累加
														
 
															+                for contribute in repo[k]:
														
 
															+                    if contribute_existed.get(contribute["name"]) is None:
														
 
															+                        result["total"][k].append(contribute.copy())
														
 
															+                        contribute_existed[contribute["name"]] = pos
														
 
															+                        pos += 1
														
 
															+                    else:
														
 
															+                        result["total"][k][contribute_existed[contribute["name"]]]["contributions"] += \
														
 
															+                            contribute["contributions"]
														
 
															+    result["total"]["contributor_list"].sort(key=lambda a: a["contributions"], reverse=True)
														
 
															+    result["total"]["contributor"] = len(contribute_existed)
														
 
															+    dic.update(result)
														
 
															+    return dic
														
 
															+
														
 
															+
														
 
															+def get_dict():
														
 
															+    """
														
 
															+    :return:带有信息的py字典
														
 
															+    """
														
 
															+    # 获取用户信息
														
 
															+    info_dict = {"repositories": []}
														
 
															+    root_dict = get_info(r"https://api.github.com/users/" + USER + r"/repos")
														
 
															+
														
 
															+    # 解析信息
														
 
															+    def thread(dic):
														
 
															+        result = get_repo(dic)
														
 
															+        info_dict["repositories"].append(result)
														
 
															+        return 1
														
 
															+
														
 
															+    # 分别获取每个仓库
														
 
															+    thread_list = []
														
 
															+    wrong_list = []
														
 
															+    for dic in root_dict:
														
 
															+        thread_list.append(pool.submit(thread, dic))
														
 
															+        time.sleep(0.05)
														
 
															+        # 等待线程完毕
														
 
															+    while thread_list:
														
 
															+        for x in thread_list:
														
 
															+            if x.done() and x.result():
														
 
															+                thread_list.remove(x)
														
 
															+            elif x.done() and not x.result():
														
 
															+                wrong_list.append(x.exception())
														
 
															+                thread_list.remove(x)
														
 
															+            stdout.write('\r %d threads left. . .' % (len(thread_list)))
														
 
															+
														
 
															+    # 输出线程完成情况
														
 
															+    stdout.write('\r Done!During the process,%d exceptions have been raised. . . ' % (len(wrong_list)))
														
 
															+    stdout.flush()
														
 
															+
														
 
															+    if len(wrong_list):
														
 
															+        for i in wrong_list:
														
 
															+            stdout.write(str(i) + "\n")
														
 
															+            stdout.flush()
														
 
															+
														
 
															+    # 按名字字母排序
														
 
															+    info_dict["repositories"].sort(key=lambda a: a["name"].lower())
														
 
															+    return sum_up(info_dict)
														
 
															+
														
 
															+
														
 
															+def get_json(dic=None):
														
 
															+    """
														
 
															+    :return:带有信息的json文本
														
 
															+    """
														
 
															+    if not dic:
														
 
															+        return json.dumps(get_dict(), sort_keys=False, indent=4, separators=(',', ':'), ensure_ascii=False)
														
 
															+    else:
														
 
															+        return json.dumps(dic, sort_keys=False, indent=4, separators=(',', ':'), ensure_ascii=False)
														
 
															+
														
 
															+
														
 
															+def wt_json(text):
														
 
															+    if PATH:
														
 
															+        with open(path.join(PATH, "github_info.json"), "w", encoding="utf-8") as f:
														
 
															+            f.write(text)
														
 
															+            f.flush()
														
 
															+    else:
														
 
															+        with open(path.join(PATH, "github_info.json"), "w", encoding="utf-8") as f:
														
 
															+            f.write(text)
														
 
															+            f.flush()
														
 
															+
														
 
															+
														
 
															+def wt_excel(dic):
														
 
															+    wb = xlwt.Workbook()
														
 
															+    # try:
														
 
															+    # 写入仓库数据
														
 
															+    tb1 = wb.add_sheet("repositories", cell_overwrite_ok=True)
														
 
															+    for i in range(len(head1)):
														
 
															+        tb1.write(0, i, head1[i])
														
 
															+    for i in range(len(dic["repositories"])):
														
 
															+        for j in range(len(head1)):
														
 
															+            tb1.write(i + 1, j, dic["repositories"][i][head1[j]])
														
 
															+    # 写入总计数据
														
 
															+    for i in range(len(head1)):
														
 
															+        if head1[i] == "name":
														
 
															+            tb1.write(len(dic["repositories"]) + 1, i, "Total")
														
 
															+            continue
														
 
															+        # if type(dic["total"][head1[i]]) == ("dict" or "list"):
														
 
															+        #     tb1.write(len(dic["repositories"]) + 2, i, len(dic["total"][head1[i]]))
														
 
															+        # else:
														
 
															+        tb1.write(len(dic["repositories"]) + 1, i, dic["total"][head1[i]])
														
 
															+    # 写入贡献者名单
														
 
															+    tb2 = wb.add_sheet("contributor list", cell_overwrite_ok=True)
														
 
															+    for i in range(len(head2)):
														
 
															+        tb2.write(0, i, head2[i])
														
 
															+    for i in range(len(dic["total"]["contributor_list"])):
														
 
															+        for j in range(len(head2)):
														
 
															+            tb2.write(i + 1, j, dic["total"]["contributor_list"][i][head2[j]])
														
 
															+
														
 
															+        # except Exception as e:
														
 
															+        #     print("\n")
														
 
															+        #     print(e)
														
 
															+        wb.save(path.join(PATH, "statistics.xls"))
														
 
															+
														
 
															+
														
 
															+if __name__ == '__main__':
														
 
															+    dic = get_dict()
														
 
															+    wt_json(get_json(dic))
														
 
															+    wt_excel(dic)
														
--- a/info-spider/requirements.txt
+++ b/info-spider/requirements.txt
@@ -0,0 +1,3 @@
 
															+requests~=2.31.0
														
 
															+xlwt~=1.3.0
														
 
															+retry~=0.9.2