2024-07-05 01:33:53 +00:00
|
|
|
|
import json
|
2024-07-09 04:05:40 +00:00
|
|
|
|
import os
|
|
|
|
|
import zipfile
|
2024-07-12 09:41:56 +00:00
|
|
|
|
import git
|
|
|
|
|
import urllib.parse
|
|
|
|
|
import re
|
2024-07-05 01:33:53 +00:00
|
|
|
|
|
2024-07-12 09:41:56 +00:00
|
|
|
|
def is_valid_json(json_string):
    """Report whether a string is, or contains, parseable JSON.

    If the text contains a ``{...}`` span (from the first ``{`` to the last
    ``}``, newlines included), only that span is parsed, which tolerates
    extra text around a JSON object. Otherwise the whole string is parsed.

    Args:
        json_string: Text to check.

    Returns:
        bool: True if the selected text parses as JSON; False if it does
        not, or if ``json_string`` is not a ``str``.
    """
    # re.search raises TypeError on None/bytes/etc.; report those as invalid
    # instead of letting the exception escape a boolean predicate.
    if not isinstance(json_string, str):
        return False

    match = re.search(r'\{.*\}', json_string, re.DOTALL)
    candidate = match.group() if match else json_string
    try:
        json.loads(candidate)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return False
    return True
|
2024-07-09 03:37:30 +00:00
|
|
|
|
|
2024-07-12 09:41:56 +00:00
|
|
|
|
def clone_repo(repo_url, clone_to):
    """Clone a Git repository into a sub-directory of ``clone_to``.

    The sub-directory is named after the last path segment of ``repo_url``.
    If that sub-directory already exists, its path is returned and nothing
    is cloned.

    Args:
        repo_url (str): URL of the repository to clone.
        clone_to (str): Local directory under which the clone is placed;
            created if it does not exist.

    Returns:
        str | None: Path of the cloned (or already existing) sub-directory,
        or None if anything fails (the error is printed).
    """
    try:
        os.makedirs(clone_to, exist_ok=True)

        # Strip trailing slashes before taking the last segment: otherwise
        # ".../repo/" yields an empty name and clone_to itself would be
        # mistaken for an already-cloned repository.
        url_path = urllib.parse.urlparse(repo_url).path.rstrip('/')
        repo_name = url_path.split('/')[-1]
        if not repo_name:
            raise ValueError(
                f"cannot determine repository name from {repo_url!r}"
            )

        cloned_path = os.path.join(clone_to, repo_name)
        if os.path.exists(cloned_path):
            return cloned_path

        git.Repo.clone_from(repo_url, cloned_path)
        print(f"Repository cloned to {cloned_path}")
        return cloned_path
    except Exception as e:
        # Best-effort boundary: callers get None rather than an exception.
        print(f"Failed to clone repository: {e}")
        return None
|
2024-07-05 01:33:53 +00:00
|
|
|
|
def unzip_file(zip_path, extract_dir):
    """Extract a zip archive into a new sub-directory of ``extract_dir``.

    The sub-directory is named after the archive's file name with its final
    extension removed.

    Args:
        zip_path (str): Path of the zip archive.
        extract_dir (str): Directory under which the sub-directory is created.

    Returns:
        str: Path of the directory the archive was extracted into.
    """
    archive_stem = os.path.splitext(os.path.basename(zip_path))[0]
    new_extract_dir = os.path.join(extract_dir, archive_stem)

    # One call creates extract_dir and the sub-directory; exist_ok avoids the
    # check-then-create race of a separate os.path.exists test.
    os.makedirs(new_extract_dir, exist_ok=True)

    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(new_extract_dir)

    return new_extract_dir
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_project_files_with_content(project_dir):
    """Collect the relative path and text content of accepted project files.

    Walks ``project_dir`` recursively. A file is included when
    ``filter_data`` accepts its name and its path relative to
    ``project_dir`` does not contain ``"__MACOSX"``.

    Args:
        project_dir (str): Root directory of the project.

    Returns:
        list: One dict per included file, with keys ``"path"`` (path
        relative to ``project_dir``) and ``"content"`` (the file read as
        UTF-8 text, undecodable bytes dropped).
    """
    collected = []

    for current_dir, _subdirs, file_names in os.walk(project_dir):
        for file_name in file_names:
            if not filter_data(file_name):
                continue

            absolute_path = os.path.join(current_dir, file_name)
            relative_path = os.path.relpath(absolute_path, project_dir)
            if "__MACOSX" in relative_path:
                continue

            with open(absolute_path, "r", encoding="utf-8", errors="ignore") as handle:
                text = handle.read()
            collected.append({"path": relative_path, "content": text})

    return collected
|
|
|
|
|
|
2024-07-09 03:37:30 +00:00
|
|
|
|
|
2024-07-05 01:33:53 +00:00
|
|
|
|
def filter_data(obj):
    """Decide whether a file name should be kept for processing.

    A name is kept when its extension (the text after the last ``.``,
    prefixed with ``.``) belongs to a language that appears both in
    ``utils/programming-languages-to-file-extensions.json`` and in
    ``LANGUAGE_TAG`` (compared case-insensitively), or when the whole name
    is listed in ``utils/keep.txt``. Both files are opened relative to the
    current working directory and are read on every call.

    Args:
        obj (str): File name to test.

    Returns:
        bool: True if the file should be kept, False otherwise.
    """
    # Only the keys are consulted in this function.
    LANGUAGE_TAG = {
        "c++": "// C++",
        "cpp": "// C++",
        "c": "// C",
        "c#": "// C#",
        "c-sharp": "// C#",
        "css": "/* CSS */",
        "cuda": "// Cuda",
        "fortran": "! Fortran",
        "go": "// Go",
        "html": "<!-- HTML -->",
        "java": "// Java",
        "js": "// JavaScript",
        "javascript": "// JavaScript",
        "kotlin": "// Kotlin",
        "lean": "-- Lean",
        "lua": "-- Lua",
        "objectivec": "// Objective-C",
        "objective-c": "// Objective-C",
        "objective-c++": "// Objective-C++",
        "pascal": "// Pascal",
        "php": "// PHP",
        "python": "# Python",
        "r": "# R",
        "rust": "// Rust",
        "ruby": "# Ruby",
        "scala": "// Scala",
        "shell": "# Shell",
        "sql": "-- SQL",
        "tex": "% TeX",
        "typescript": "// TypeScript",
        "vue": "<!-- Vue -->",
        "assembly": "; Assembly",
        "dart": "// Dart",
        "perl": "# Perl",
        "prolog": "% Prolog",
        "swift": "// swift",
        "lisp": "; Lisp",
        "vb": "' Visual Basic",
        "visual basic": "' Visual Basic",
        "matlab": "% Matlab",
        "delphi": "{ Delphi }",
        "scheme": "; Scheme",
        "basic": "' Basic",
        "groovy": "// Groovy",
        "abap": "* Abap",
        "gdscript": "# GDScript",
        "haskell": "-- Haskell",
        "julia": "# Julia",
        "elixir": "# Elixir",
        "excel": "' Excel",
        "clojure": "; Clojure",
        "actionscript": "// ActionScript",
        "solidity": "// Solidity",
        "powershell": "# PowerShell",
        "erlang": "% Erlang",
        "cobol": "// Cobol",
        "batchfile": ":: Batch file",
        "makefile": "# Makefile",
        "dockerfile": "# Dockerfile",
        "markdown": "<!-- Markdown -->",
        "cmake": "# CMake",
    }

    # Context manager so the handle is closed deterministically; the bare
    # json.load(open(...)) form left it to the garbage collector.
    with open("utils/programming-languages-to-file-extensions.json") as f:
        programming_languages_to_file_extensions = json.load(f)

    # Extensions of every language that also has an entry in LANGUAGE_TAG.
    want_languages = set()
    for language, extensions in programming_languages_to_file_extensions.items():
        if language.lower() in LANGUAGE_TAG:
            want_languages.update(extensions)

    with open("utils/keep.txt", "r") as f:
        keep_files = {line.strip() for line in f}

    # A name without a dot yields "." + the whole name, which normally
    # matches no extension and falls through to the keep-list check.
    ext = "." + obj.split(".")[-1]
    return ext in want_languages or obj in keep_files
|