mirror of
https://github.com/JasonYANG170/CodeGeeX4.git
synced 2024-11-23 20:26:29 +00:00
69 lines
1.9 KiB
Python
69 lines
1.9 KiB
Python
import os
|
|
from pathlib import Path
|
|
|
|
from llama_index.core.node_parser import CodeSplitter
|
|
from llama_index.core.schema import BaseNode
|
|
from llama_index.readers.file import FlatReader
|
|
|
|
# Supported source files: lowercase file extension (without the dot) mapped
# to the language name passed to CodeSplitter. An extension that is missing
# from this table is treated as unsupported by traverse/split_into_chunks.
Languages = {
    "c": "c",
    "cpp": "cpp",
    "go": "go",
    "java": "java",
    "js": "javascript",
    "md": "markdown",
    "py": "python",
    "ts": "typescript",
}
|
|
|
|
|
|
def traverse(repo_path: str) -> list[str]:
    """
    Traverse the directory, fetch all files
    - skip hidden entries (files and directories whose name starts with '.')
    - only keep the supported files (extension listed in ``Languages``)

    :param repo_path: path to this repo
    :return: sorted list of paths of the supported files found under
        ``repo_path``; each path is prefixed with ``repo_path`` as given
    """
    file_paths = []
    # Explicit stack instead of recursion: only one directory handle is open
    # at a time and very deep trees cannot hit the recursion limit.
    pending = [repo_path]
    while pending:
        current = pending.pop()
        # The context manager closes the scandir handle deterministically.
        with os.scandir(current) as entries:
            for entry in entries:
                if entry.name.startswith('.'):
                    continue
                if entry.is_file():
                    # splitext yields '' for names without a dot, so an
                    # extensionless file called e.g. `go` or `py` is not
                    # mistaken for a source file (split('.')[-1] returned
                    # the whole name in that case).
                    ext = os.path.splitext(entry.name)[1][1:].lower()
                    if ext in Languages:
                        file_paths.append(entry.path)
                elif entry.is_dir():
                    pending.append(entry.path)

    # Sorting makes the result independent of the visiting order.
    return sorted(file_paths)
|
|
|
|
|
|
def split_into_chunks(file_path, lines_per_chunk, lines_overlap, max_chars) -> list[BaseNode]:
    """
    Split file into chunks

    :param file_path: path to the file (a string or a path-like object)
    :param lines_per_chunk: lines for each chunk
    :param lines_overlap: overlap lines between 2 chunks
    :param max_chars: max characters for each chunk
    :return: the nodes produced by ``CodeSplitter`` for this file; an empty
        list when the file extension is not listed in ``Languages`` or when
        loading/splitting the file raises
    """
    source = Path(file_path)
    # Use the suffix of the final path component only: a bare name such as
    # `py` or a dot inside a directory name must not be read as an extension
    # (splitting the whole path on '.' did both).
    ext = source.suffix[1:].lower()
    lang = Languages.get(ext)
    if not lang:
        return []

    try:
        documents = FlatReader().load_data(source)
        splitter = CodeSplitter(
            language=lang,
            chunk_lines=lines_per_chunk,
            chunk_lines_overlap=lines_overlap,
            max_chars=max_chars,
        )
        return splitter.get_nodes_from_documents(documents)
    except Exception as e:
        # Best-effort by design: the failure is reported and an empty result
        # is returned instead of propagating (presumably so one bad file does
        # not abort a whole repository run — confirm against callers).
        print(f'`{file_path}`切分失败: {e}')
        return []
|