Fix PEP 8 errors

2024-11-23 12:16:33 +00:00 · 2024-07-09 11:37:30 +08:00 · 2024-07-09 11:37:30 +08:00 · 25870dc0a1
commit 25870dc0a1
parent 48fff1449b
7 changed files with 289 additions and 262 deletions
--- a/repodemo/llm/api/codegeex4.py
+++ b/repodemo/llm/api/codegeex4.py
@@ -1,41 +1,39 @@
 import requests
 import json
-URL = "" #the url you deploy codegeex service
+URL = ""  # the url you deploy codegeex service
 def codegeex4(prompt, temperature=0.8, top_p=0.8):
    url = URL
-    headers = {
+    headers = {"Content-Type": "application/json"}
        'Content-Type': 'application/json'
    }
    data = {
-        'inputs': prompt,
+        "inputs": prompt,
-        'parameters': {
+        "parameters": {
-            'best_of':1,
+            "best_of": 1,
-            'do_sample': True,
+            "do_sample": True,
-            'max_new_tokens': 4012,
+            "max_new_tokens": 4012,
-            'temperature': temperature,
+            "temperature": temperature,
-            'top_p': top_p,
+            "top_p": top_p,
-            'stop': ["<|endoftext|>", "<|user|>", "<|observation|>", "<|assistant|>"],
+            "stop": ["<|endoftext|>", "<|user|>", "<|observation|>", "<|assistant|>"],
-        }
+        },
    }
    response = requests.post(url, json=data, headers=headers, verify=False, stream=True)
    if response.status_code == 200:
        for line in response.iter_lines():
            if line:
-                decoded_line = line.decode('utf-8').replace('data:', '').strip()
+                decoded_line = line.decode("utf-8").replace("data:", "").strip()
                if decoded_line:
                    try:
                        content = json.loads(decoded_line)
-                        token_text = content.get('token', {}).get('text', '')
+                        token_text = content.get("token", {}).get("text", "")
-                        if '<|endoftext|>' in token_text:
+                        if "<|endoftext|>" in token_text:
                            break
                        yield token_text
                    except json.JSONDecodeError:
                        continue
    else:
-        print('请求失败:', response.status_code)
+        print("请求失败:", response.status_code)
--- a/repodemo/llm/local/codegeex4.py
+++ b/repodemo/llm/local/codegeex4.py
@@ -3,45 +3,49 @@ from transformers import AutoModel, AutoTokenizer
 from typing import Iterator
 import torch
-class CodegeexChatModel():
+
 class CodegeexChatModel:
    device: str = Field(description="device to load the model")
    tokenizer = Field(description="model's tokenizer")
    model = Field(description="Codegeex model")
    temperature: float = Field(description="temperature to use for the model.")
-    def __init__(self,model_name_or_path):
+    def __init__(self, model_name_or_path):
        super().__init__()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
+        self.tokenizer = AutoTokenizer.from_pretrained(
-        self.model = AutoModel.from_pretrained(
+            model_name_or_path, trust_remote_code=True
-            model_name_or_path,
+        )
-            trust_remote_code=True
+        self.model = (
-        ).to(self.device).eval()
+            AutoModel.from_pretrained(model_name_or_path, trust_remote_code=True)
            .to(self.device)
            .eval()
        )
        print("Model has been initialized.")
-    def chat(self, prompt,temperature=0.2,top_p=0.95):
+    def chat(self, prompt, temperature=0.2, top_p=0.95):
        try:
            response, _ = self.model.chat(
                self.tokenizer,
                query=prompt,
                max_length=120000,
                temperature=temperature,
-                top_p=top_p
+                top_p=top_p,
            )
            return response
        except Exception as e:
            return f"error:{e}"
-    def stream_chat(self,prompt,temperature=0.2,top_p=0.95):
+    def stream_chat(self, prompt, temperature=0.2, top_p=0.95):
        try:
            for response, _ in self.model.stream_chat(
-                    self.tokenizer,
+                self.tokenizer,
-                    query=prompt,
+                query=prompt,
-                    max_length=120000,
+                max_length=120000,
-                    temperature=temperature,
+                temperature=temperature,
-                    top_p=top_p
+                top_p=top_p,
            ):
                yield response
        except Exception as e:
-            yield f'error: {e}'
+            yield f"error: {e}"
--- a/repodemo/prompts/base_prompt.py
+++ b/repodemo/prompts/base_prompt.py
@@ -4,7 +4,7 @@ repo_system_prompt = """<|system|>\n你是一位智能编程助手，你叫CodeG
 judge_task_prompt = """<|system|>\n你是一位任务分类专家，请你对用户的输入进行分类（问答/修改/正常），如果用户的输入是对项目进行提问则只需要输出问答两个字，如果用户的输入是对项目进行修改或增加则只需要输出修改两个字，如果用户输入的是一个与项目无关的问题则只需要输出正常两个字。<|user|>\n{user_input}<|assistant|>\n"""
-web_judge_task_prompt ="""<|system|>\n你是一位智能编程助手，你叫CodeGeeX。你会为用户回答关于编程、代码、计算机方面的任何问题，并提供格式规范、可以执行、准确安全的代码，并在必要时提供详细的解释。<|user|>\n{user_input}\n这个问题需要进行联网来回答吗？仅回答“是”或者“否”。<|assistant|>\n"""
+web_judge_task_prompt = """<|system|>\n你是一位智能编程助手，你叫CodeGeeX。你会为用户回答关于编程、代码、计算机方面的任何问题，并提供格式规范、可以执行、准确安全的代码，并在必要时提供详细的解释。<|user|>\n{user_input}\n这个问题需要进行联网来回答吗？仅回答“是”或者“否”。<|assistant|>\n"""
 # judge_task_prompt = """<|system|>\n你是一位任务分类专家，请你对用户的输入进行分类（问答/修改），如果用户的输入是对项目进行提问则只需要输出问答两个字，如果用户的输入是对项目进行修改或增加则只需要输出修改两个字。<|user|>\n{user_input}<|assistant|>\n"""
 web_search_prompy = """
@@ -19,24 +19,27 @@ web_search_prompy = """
 - 除了代码和特定的名称和引用外，您的答案必须使用与问题相同的语言来撰写。
 """.lstrip()
-def get_cur_base_user_prompt(message_history,index_prompt = None,judge_context = ""):
+
 def get_cur_base_user_prompt(message_history, index_prompt=None, judge_context=""):
    user_prompt_tmp = """<|user|>\n{user_input}"""
    assistant_prompt_tmp = """<|assistant|>\n{assistant_input}"""
    history_prompt = ""
-    for i,message in enumerate(message_history):
+    for i, message in enumerate(message_history):
-        if message['role'] == 'user':
+        if message["role"] == "user":
-            if i==0 and index_prompt is not None:
+            if i == 0 and index_prompt is not None:
-                history_prompt += "<|user|>\n"+index_prompt+message['content']
+                history_prompt += "<|user|>\n" + index_prompt + message["content"]
            else:
-                history_prompt += user_prompt_tmp.format(user_input=message['content'])
+                history_prompt += user_prompt_tmp.format(user_input=message["content"])
-        elif message['role'] ==  'assistant':
+        elif message["role"] == "assistant":
-            history_prompt += assistant_prompt_tmp.format(assistant_input=message['content'])
+            history_prompt += assistant_prompt_tmp.format(
                assistant_input=message["content"]
            )
    # print("修改" not in judge_context)
    # print(judge_context)
    if "修改" not in judge_context:
-        result = base_system_prompt+history_prompt+"""<|assistant|>\n"""
+        result = base_system_prompt + history_prompt + """<|assistant|>\n"""
    else:
-        result = repo_system_prompt+history_prompt+"""<|assistant|>\n"""
+        result = repo_system_prompt + history_prompt + """<|assistant|>\n"""
    print(result)
    return result
--- a/repodemo/run.py
+++ b/repodemo/run.py
@@ -1,8 +1,12 @@
 import chainlit as cl
 from chainlit.input_widget import Slider
 from llm.api.codegeex4 import codegeex4
-from prompts.base_prompt import judge_task_prompt,get_cur_base_user_prompt,web_judge_task_prompt
+from prompts.base_prompt import (
-from utils.tools import unzip_file,get_project_files_with_content
+    judge_task_prompt,
    get_cur_base_user_prompt,
    web_judge_task_prompt,
 )
 from utils.tools import unzip_file, get_project_files_with_content
 from utils.bingsearch import bing_search_prompt
@@ -12,41 +16,33 @@ async def chat_profile():
        cl.ChatProfile(
            name="chat聊天",
            markdown_description="聊天demo：支持多轮对话。",
-            starters = [
+            starters=[
                cl.Starter(
-                label="请你用python写一个快速排序。",
+                    label="请你用python写一个快速排序。",
-                message="请你用python写一个快速排序。",
+                    message="请你用python写一个快速排序。",
                ),
-
+                cl.Starter(
-            cl.Starter(
+                    label="请你介绍一下自己。",
-                label="请你介绍一下自己。",
+                    message="请你介绍一下自己。",
                message="请你介绍一下自己。",
                ),
-            cl.Starter(
+                cl.Starter(
-                label="用 Python 编写一个脚本来自动发送每日电子邮件报告，并指导我如何进行设置。",
+                    label="用 Python 编写一个脚本来自动发送每日电子邮件报告，并指导我如何进行设置。",
-                message="用 Python 编写一个脚本来自动发送每日电子邮件报告，并指导我如何进行设置。",
+                    message="用 Python 编写一个脚本来自动发送每日电子邮件报告，并指导我如何进行设置。",
                ),
-            cl.Starter(
+                cl.Starter(
-                label="我是一个python初学者，请你告诉我怎么才能学好python。",
+                    label="我是一个python初学者，请你告诉我怎么才能学好python。",
-                message="我是一个python初学者，请你告诉我怎么才能学好python。",
+                    message="我是一个python初学者，请你告诉我怎么才能学好python。",
-                
+                ),
-                )
+            ],
            ]
        ),
        cl.ChatProfile(
            name="联网问答",
-            markdown_description="联网能力dome：支持联网回答用户问题。",
+            markdown_description="联网能力demo：支持联网回答用户问题。",
        ),
        cl.ChatProfile(
            name="上传本地项目",
-            markdown_description="项目级能力dome：支持上传本地zip压缩包项目，可以进行项目问答和对项目进行修改。",
+            markdown_description="项目级能力demo：支持上传本地zip压缩包项目，可以进行项目问答和对项目进行修改。",
-            
+        ),
        )
    ]
@@ -74,34 +70,32 @@ async def start():
    ).send()
    temperature = settings["temperature"]
    top_p = settings["top_p"]
-    cl.user_session.set('temperature',temperature)
+    cl.user_session.set("temperature", temperature)
-    cl.user_session.set('top_p',top_p)
+    cl.user_session.set("top_p", top_p)
-    cl.user_session.set(
+    cl.user_session.set("message_history", [])
        "message_history",
        []
    )
    chat_profile = cl.user_session.get("chat_profile")
-    extract_dir = 'repodata'
+    extract_dir = "repodata"
    if chat_profile == "chat聊天":
        pass
-    elif chat_profile =="上传本地项目":
+    elif chat_profile == "上传本地项目":
        files = None
        while files == None:
            files = await cl.AskFileMessage(
-                content="请上传项目zip压缩文件!", accept={"application/zip": [".zip"]},max_size_mb=50
+                content="请上传项目zip压缩文件!",
                accept={"application/zip": [".zip"]},
                max_size_mb=50,
            ).send()
        text_file = files[0]
-        extracted_path = unzip_file(text_file.path,extract_dir)
+        extracted_path = unzip_file(text_file.path, extract_dir)
        files_list = get_project_files_with_content(extracted_path)
-        cl.user_session.set("project_index",files_list)
+        cl.user_session.set("project_index", files_list)
-        if len(files_list)>0:
+        if len(files_list) > 0:
            await cl.Message(
                content=f"已成功上传，您可以开始对项目进行提问！",
            ).send()
@cl.on_message
 async def main(message: cl.Message):
    chat_profile = cl.user_session.get("chat_profile")
@@ -110,42 +104,56 @@ async def main(message: cl.Message):
    if chat_profile == "chat聊天":
        prompt_content = get_cur_base_user_prompt(message_history=message_history)
-    elif chat_profile=="联网问答":
+    elif chat_profile == "联网问答":
-        judge_tmp = codegeex4(web_judge_task_prompt.format(user_input=message.content),temperature=0.2,top_p = 0.95)
+        judge_tmp = codegeex4(
-        judge_context = '\n'.join(judge_tmp)
+            web_judge_task_prompt.format(user_input=message.content),
            temperature=0.2,
            top_p=0.95,
        )
        judge_context = "\n".join(judge_tmp)
        print(judge_context)
        message_history.pop()
-        if '是' in judge_context:
+        if "是" in judge_context:
            prompt_tmp = bing_search_prompt(message.content)
            message_history.append({"role": "user", "content": prompt_tmp})
        else:
            message_history.append({"role": "user", "content": message.content})
        prompt_content = get_cur_base_user_prompt(message_history=message_history)
-    elif chat_profile =="上传本地项目" :
+    elif chat_profile == "上传本地项目":
-        judge_tmp = codegeex4(judge_task_prompt.format(user_input=message.content),temperature=0.2,top_p = 0.95)
+        judge_tmp = codegeex4(
-        judge_context = ''
+            judge_task_prompt.format(user_input=message.content),
            temperature=0.2,
            top_p=0.95,
        )
        judge_context = ""
        for part in judge_tmp:
-            judge_context+=part
+            judge_context += part
        project_index = cl.user_session.get("project_index")
        index_prompt = ""
        index_tmp = """###PATH:{path}\n{code}\n"""
        for index in project_index:
-            index_prompt+=index_tmp.format(path=index['path'],code=index['content'])
+            index_prompt += index_tmp.format(path=index["path"], code=index["content"])
        print(judge_context)
-        prompt_content = get_cur_base_user_prompt(message_history=message_history,index_prompt=index_prompt,judge_context=judge_context) if '正常' not in judge_context else get_cur_base_user_prompt(message_history=message_history)
+        prompt_content = (
-    
+            get_cur_base_user_prompt(
-    
+                message_history=message_history,
                index_prompt=index_prompt,
                judge_context=judge_context,
            )
            if "正常" not in judge_context
            else get_cur_base_user_prompt(message_history=message_history)
        )
    msg = cl.Message(content="")
    await msg.send()
    temperature = cl.user_session.get("temperature")
-    top_p = cl.user_session.get('top_p')
+    top_p = cl.user_session.get("top_p")
-    if len(prompt_content)/4<120000:
+    if len(prompt_content) / 4 < 120000:
-        stream =  codegeex4(prompt_content,temperature=temperature,top_p = top_p)
+        stream = codegeex4(prompt_content, temperature=temperature, top_p=top_p)
        for part in stream:
            if token := (part or " "):
--- a/repodemo/run_local.py
+++ b/repodemo/run_local.py
@@ -1,63 +1,62 @@
 import chainlit as cl
 from chainlit.input_widget import Slider
 from llm.api.codegeex4 import codegeex4
-from prompts.base_prompt import judge_task_prompt,get_cur_base_user_prompt,web_judge_task_prompt
+from prompts.base_prompt import (
-from utils.tools import unzip_file,get_project_files_with_content
+    judge_task_prompt,
    get_cur_base_user_prompt,
    web_judge_task_prompt,
 )
 from utils.tools import unzip_file, get_project_files_with_content
 from utils.bingsearch import bing_search_prompt
 from llm.local.codegeex4 import CodegeexChatModel
-local_model_path = '<your_local_model_path>'
+
 local_model_path = "<your_local_model_path>"
 llm = CodegeexChatModel(local_model_path)
 class StreamProcessor:
    def __init__(self):
        self.previous_str = ""
    def get_new_part(self, new_str):
-        new_part = new_str[len(self.previous_str):]
+        new_part = new_str[len(self.previous_str) :]
        self.previous_str = new_str
        return new_part
@cl.set_chat_profiles
 async def chat_profile():
    return [
        cl.ChatProfile(
            name="chat聊天",
            markdown_description="聊天demo：支持多轮对话。",
-            starters = [
+            starters=[
                cl.Starter(
-                label="请你用python写一个快速排序。",
+                    label="请你用python写一个快速排序。",
-                message="请你用python写一个快速排序。",
+                    message="请你用python写一个快速排序。",
                ),
-
+                cl.Starter(
-            cl.Starter(
+                    label="请你介绍一下自己。",
-                label="请你介绍一下自己。",
+                    message="请你介绍一下自己。",
                message="请你介绍一下自己。",
                ),
-            cl.Starter(
+                cl.Starter(
-                label="用 Python 编写一个脚本来自动发送每日电子邮件报告，并指导我如何进行设置。",
+                    label="用 Python 编写一个脚本来自动发送每日电子邮件报告，并指导我如何进行设置。",
-                message="用 Python 编写一个脚本来自动发送每日电子邮件报告，并指导我如何进行设置。",
+                    message="用 Python 编写一个脚本来自动发送每日电子邮件报告，并指导我如何进行设置。",
                ),
-            cl.Starter(
+                cl.Starter(
-                label="我是一个python初学者，请你告诉我怎么才能学好python。",
+                    label="我是一个python初学者，请你告诉我怎么才能学好python。",
-                message="我是一个python初学者，请你告诉我怎么才能学好python。",
+                    message="我是一个python初学者，请你告诉我怎么才能学好python。",
-                
+                ),
-                )
+            ],
            ]
        ),
        cl.ChatProfile(
            name="联网问答",
-            markdown_description="联网能力dome：支持联网回答用户问题。",
+            markdown_description="联网能力demo：支持联网回答用户问题。",
        ),
        cl.ChatProfile(
            name="上传本地项目",
-            markdown_description="项目级能力dome：支持上传本地zip压缩包项目，可以进行项目问答和对项目进行修改。",
+            markdown_description="项目级能力demo：支持上传本地zip压缩包项目，可以进行项目问答和对项目进行修改。",
-            
+        ),
        )
    ]
@@ -85,34 +84,32 @@ async def start():
    ).send()
    temperature = settings["temperature"]
    top_p = settings["top_p"]
-    cl.user_session.set('temperature',temperature)
+    cl.user_session.set("temperature", temperature)
-    cl.user_session.set('top_p',top_p)
+    cl.user_session.set("top_p", top_p)
-    cl.user_session.set(
+    cl.user_session.set("message_history", [])
        "message_history",
        []
    )
    chat_profile = cl.user_session.get("chat_profile")
-    extract_dir = 'repodata'
+    extract_dir = "repodata"
    if chat_profile == "chat聊天":
        pass
-    elif chat_profile =="上传本地项目":
+    elif chat_profile == "上传本地项目":
        files = None
        while files == None:
            files = await cl.AskFileMessage(
-                content="请上传项目zip压缩文件!", accept={"application/zip": [".zip"]},max_size_mb=50
+                content="请上传项目zip压缩文件!",
                accept={"application/zip": [".zip"]},
                max_size_mb=50,
            ).send()
        text_file = files[0]
-        extracted_path = unzip_file(text_file.path,extract_dir)
+        extracted_path = unzip_file(text_file.path, extract_dir)
        files_list = get_project_files_with_content(extracted_path)
-        cl.user_session.set("project_index",files_list)
+        cl.user_session.set("project_index", files_list)
-        if len(files_list)>0:
+        if len(files_list) > 0:
            await cl.Message(
                content=f"已成功上传，您可以开始对项目进行提问！",
            ).send()
@cl.on_message
 async def main(message: cl.Message):
    chat_profile = cl.user_session.get("chat_profile")
@@ -121,45 +118,54 @@ async def main(message: cl.Message):
    if chat_profile == "chat聊天":
        prompt_content = get_cur_base_user_prompt(message_history=message_history)
-    elif chat_profile=="联网问答":
+    elif chat_profile == "联网问答":
-        judge_context = llm.chat(web_judge_task_prompt.format(user_input=message.content),temperature=0.2)
+        judge_context = llm.chat(
            web_judge_task_prompt.format(user_input=message.content), temperature=0.2
        )
        print(judge_context)
        message_history.pop()
-        if '是' in judge_context:
+        if "是" in judge_context:
            prompt_tmp = bing_search_prompt(message.content)
            message_history.append({"role": "user", "content": prompt_tmp})
        else:
            message_history.append({"role": "user", "content": message.content})
        prompt_content = get_cur_base_user_prompt(message_history=message_history)
-    elif chat_profile =="上传本地项目" :
+    elif chat_profile == "上传本地项目":
-        judge_context = llm.chat(judge_task_prompt.format(user_input=message.content),temperature=0.2)
+        judge_context = llm.chat(
-        
+            judge_task_prompt.format(user_input=message.content), temperature=0.2
        )
        project_index = cl.user_session.get("project_index")
        index_prompt = ""
        index_tmp = """###PATH:{path}\n{code}\n"""
        for index in project_index:
-            index_prompt+=index_tmp.format(path=index['path'],code=index['content'])
+            index_prompt += index_tmp.format(path=index["path"], code=index["content"])
        print(judge_context)
-        prompt_content = get_cur_base_user_prompt(message_history=message_history,index_prompt=index_prompt,judge_context=judge_context) if '正常' not in judge_context else get_cur_base_user_prompt(message_history=message_history)
+        prompt_content = (
-    
+            get_cur_base_user_prompt(
-    
+                message_history=message_history,
                index_prompt=index_prompt,
                judge_context=judge_context,
            )
            if "正常" not in judge_context
            else get_cur_base_user_prompt(message_history=message_history)
        )
    msg = cl.Message(content="")
    await msg.send()
    temperature = cl.user_session.get("temperature")
-    top_p = cl.user_session.get('top_p')
+    top_p = cl.user_session.get("top_p")
-    if len(prompt_content)/4<120000:
+    if len(prompt_content) / 4 < 120000:
-        stream =  llm.stream_chat(prompt_content,temperature=temperature,top_p = top_p)
+        stream = llm.stream_chat(prompt_content, temperature=temperature, top_p=top_p)
        stream_processor = StreamProcessor()
        for part in stream:
            if isinstance(part, str):
                text = stream_processor.get_new_part(part)
            elif isinstance(part, dict):
-                text = stream_processor.get_new_part(part['name']+part['content'])
+                text = stream_processor.get_new_part(part["name"] + part["content"])
            if token := (text or " "):
                await msg.stream_token(token)
    else:
--- a/repodemo/utils/bingsearch.py
+++ b/repodemo/utils/bingsearch.py
@@ -2,7 +2,9 @@ import requests
 from bs4 import BeautifulSoup as BS4
 import requests
-BING_API_KEY = '<your_bing_api_key>'
+BING_API_KEY = "<your_bing_api_key>"
 def search_with_bing(query: str, search_timeout=30, top_k=6) -> list[dict]:
    """
    Search with bing and return the contexts.
@@ -13,9 +15,9 @@ def search_with_bing(query: str, search_timeout=30, top_k=6) -> list[dict]:
        headers={"Ocp-Apim-Subscription-Key": BING_API_KEY},
        params={
            "q": query,
-            "responseFilter": ['webpages'],
+            "responseFilter": ["webpages"],
-            "freshness": 'month',
+            "freshness": "month",
-            "mkt": 'zh-CN'
+            "mkt": "zh-CN",
        },
        timeout=search_timeout,
    )
@@ -23,25 +25,29 @@ def search_with_bing(query: str, search_timeout=30, top_k=6) -> list[dict]:
        json_content = response.json()
        # print(json_content)
        contexts = json_content["webPages"]["value"][:top_k]
-        #logger.info("Web搜索完成")
+        # logger.info("Web搜索完成")
        return contexts
    except Exception as e:
-        #logger.error(f"搜索失败，错误原因: {e}")
+        # logger.error(f"搜索失败，错误原因: {e}")
        print(f"搜索失败，错误原因: {e}")
        return []
 def fetch_url(url):
    response = requests.get(url)
-    #use beautifulsoup4 to parse html
+    # use beautifulsoup4 to parse html
-    soup = BS4(response.text, 'html.parser')
+    soup = BS4(response.text, "html.parser")
    plain_text = soup.get_text()
    return plain_text
 def bing_search_prompt(input):
    contents = search_with_bing(input, search_timeout=5, top_k=6)
    citations = "\n\n".join(
-        [f"[[citation:{i + 1}]]\n```markdown\n{item['snippet']}\n```" for i, item in enumerate(contents)]
+        [
            f"[[citation:{i + 1}]]\n```markdown\n{item['snippet']}\n```"
            for i, item in enumerate(contents)
        ]
    )
    prompt = f"[引用]\n{citations}\n问：{input}\n"
    return prompt
--- a/repodemo/utils/tools.py
+++ b/repodemo/utils/tools.py
@@ -2,6 +2,7 @@ import zipfile
 import os
 import json
 def unzip_file(zip_path, extract_dir):
    """
    解压zip文件到指定目录，并在指定目录下创建一个新的目录存放解压后的文件
@@ -23,7 +24,7 @@ def unzip_file(zip_path, extract_dir):
    if not os.path.exists(new_extract_dir):
        os.makedirs(new_extract_dir)
-    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(new_extract_dir)
    return new_extract_dir
@@ -48,83 +49,85 @@ def get_project_files_with_content(project_dir):
                relative_path = os.path.relpath(file_path, project_dir)
                if "__MACOSX" in relative_path:
                    continue
-                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                    content = f.read()
-                files_list.append({'path': relative_path, 'content': content})
+                files_list.append({"path": relative_path, "content": content})
            else:
                continue
    return files_list
 def filter_data(obj):
    LANGUAGE_TAG = {
-    "c++"          : "// C++",
+        "c++": "// C++",
-    "cpp"          : "// C++",
+        "cpp": "// C++",
-    "c"            : "// C",
+        "c": "// C",
-    "c#"           : "// C#",
+        "c#": "// C#",
-    "c-sharp"      : "// C#",
+        "c-sharp": "// C#",
-    "css"          : "/* CSS */",
+        "css": "/* CSS */",
-    "cuda"         : "// Cuda",
+        "cuda": "// Cuda",
-    "fortran"      : "! Fortran",
+        "fortran": "! Fortran",
-    "go"           : "// Go",
+        "go": "// Go",
-    "html"         : "<!-- HTML -->",
+        "html": "<!-- HTML -->",
-    "java"         : "// Java",
+        "java": "// Java",
-    "js"           : "// JavaScript",
+        "js": "// JavaScript",
-    "javascript"   : "// JavaScript",
+        "javascript": "// JavaScript",
-    "kotlin"       : "// Kotlin",
+        "kotlin": "// Kotlin",
-    "lean"         : "-- Lean",
+        "lean": "-- Lean",
-    "lua"          : "-- Lua",
+        "lua": "-- Lua",
-    "objectivec"  : "// Objective-C",
+        "objectivec": "// Objective-C",
-    "objective-c"  : "// Objective-C",
+        "objective-c": "// Objective-C",
-    "objective-c++": "// Objective-C++",
+        "objective-c++": "// Objective-C++",
-    "pascal"       : "// Pascal",
+        "pascal": "// Pascal",
-    "php"          : "// PHP",
+        "php": "// PHP",
-    "python"       : "# Python",
+        "python": "# Python",
-    "r"            : "# R",
+        "r": "# R",
-    "rust"         : "// Rust",
+        "rust": "// Rust",
-    "ruby"         : "# Ruby",
+        "ruby": "# Ruby",
-    "scala"        : "// Scala",
+        "scala": "// Scala",
-    "shell"        : "# Shell",
+        "shell": "# Shell",
-    "sql"          : "-- SQL",
+        "sql": "-- SQL",
-    "tex"          : f"% TeX",
+        "tex": f"% TeX",
-    "typescript"   : "// TypeScript",
+        "typescript": "// TypeScript",
-    "vue"          : "<!-- Vue -->",
+        "vue": "<!-- Vue -->",
-
+        "assembly": "; Assembly",
-    "assembly"     : "; Assembly",
+        "dart": "// Dart",
-    "dart"         : "// Dart",
+        "perl": "# Perl",
-    "perl"         : "# Perl",
+        "prolog": f"% Prolog",
-    "prolog"       : f"% Prolog",
+        "swift": "// swift",
-    "swift"        : "// swift",
+        "lisp": "; Lisp",
-    "lisp"         : "; Lisp",
+        "vb": "' Visual Basic",
-    "vb"           : "' Visual Basic",
+        "visual basic": "' Visual Basic",
-    "visual basic" : "' Visual Basic",
+        "matlab": f"% Matlab",
-    "matlab"       : f"% Matlab",
+        "delphi": "{ Delphi }",
-    "delphi"       : "{ Delphi }",
+        "scheme": "; Scheme",
-    "scheme"       : "; Scheme",
+        "basic": "' Basic",
-    "basic"        : "' Basic",
+        "assembly": "; Assembly",
-    "assembly"     : "; Assembly",
+        "groovy": "// Groovy",
-    "groovy"       : "// Groovy",
+        "abap": "* Abap",
-    "abap"         : "* Abap",
+        "gdscript": "# GDScript",
-    "gdscript"     : "# GDScript",
+        "haskell": "-- Haskell",
-    "haskell"      : "-- Haskell",
+        "julia": "# Julia",
-    "julia"        : "# Julia",
+        "elixir": "# Elixir",
-    "elixir"       : "# Elixir",
+        "excel": "' Excel",
-    "excel"        : "' Excel",
+        "clojure": "; Clojure",
-    "clojure"      : "; Clojure",
+        "actionscript": "// ActionScript",
-    "actionscript" : "// ActionScript",
+        "solidity": "// Solidity",
-    "solidity"     : "// Solidity",
+        "powershell": "# PowerShell",
-    "powershell"   : "# PowerShell",
+        "erlang": f"% Erlang",
-    "erlang"       : f"% Erlang",
+        "cobol": "// Cobol",
-    "cobol"        : "// Cobol",
+        "batchfile": ":: Batch file",
-    "batchfile"  : ":: Batch file",
+        "makefile": "# Makefile",
-    "makefile"     : "# Makefile",
+        "dockerfile": "# Dockerfile",
-    "dockerfile"   : "# Dockerfile",
+        "markdown": "<!-- Markdown -->",
-    "markdown"     : "<!-- Markdown -->",
+        "cmake": "# CMake",
-    "cmake"        : "# CMake",
+        "dockerfile": "# Dockerfile",
    "dockerfile"   : "# Dockerfile",
    }
-    programming_languages_to_file_extensions = json.load(open('utils/programming-languages-to-file-extensions.json'))
+    programming_languages_to_file_extensions = json.load(
        open("utils/programming-languages-to-file-extensions.json")
    )
    need2del = []
    for key in programming_languages_to_file_extensions.keys():
        if key.lower() not in LANGUAGE_TAG:
@@ -140,15 +143,14 @@ def filter_data(obj):
            ext_to_programming_languages[item] = key
            want_languages.append(item)
-    ext = '.'+obj.split('.')[-1]
+    ext = "." + obj.split(".")[-1]
-    with open('utils/keep.txt', 'r') as f:
+    with open("utils/keep.txt", "r") as f:
        keep_files = f.readlines()
        keep_files = [l.strip() for l in keep_files]
-    #print(ext)
+    # print(ext)
    if ext not in want_languages:
        if obj in keep_files:
            return True
        return False
    else:
        return True