Add additional filetypes
Some checks failed
Pytest / Explore-Gitea-Actions (push) Failing after -14m24s

This commit is contained in:
Alex Selimov 2025-03-22 01:08:38 -04:00
parent 3e7f4e42f7
commit 550e4b78e2

View File

@ -74,18 +74,7 @@ class RAG:
""" """
# Set default extensions for common programming languages if none provided # Set default extensions for common programming languages if none provided
if extensions is None: if extensions is None:
extensions = [ extensions = default_file_extensions()
# Python
'.py', '.pyi', '.pyx',
# C/C++
'.c', '.cpp', '.cc', '.cxx', '.h', '.hpp', '.hxx',
# Rust
'.rs',
# Documentation
'.txt', '.md',
# Build/Config
'.toml', '.yaml', '.json'
]
elif isinstance(extensions, str): elif isinstance(extensions, str):
extensions = [extensions] extensions = [extensions]
@ -103,14 +92,12 @@ class RAG:
if os.path.exists(self.db_dir) and not force_refresh: if os.path.exists(self.db_dir) and not force_refresh:
print("Loading existing vector store") print("Loading existing vector store")
vectorstore = Chroma( vectorstore = Chroma(
persist_directory=self.db_dir, persist_directory=self.db_dir, embedding_function=embeddings
embedding_function=embeddings
) )
else: else:
print("Creating new vector store") print("Creating new vector store")
vectorstore = Chroma( vectorstore = Chroma(
persist_directory=self.db_dir, persist_directory=self.db_dir, embedding_function=embeddings
embedding_function=embeddings
) )
# Find all files that match the extensions # Find all files that match the extensions
@ -127,14 +114,14 @@ class RAG:
all_documents = [] all_documents = []
for file_path in all_files: for file_path in all_files:
try: try:
with open(file_path, 'r', encoding='utf-8') as f: with open(file_path, "r", encoding="utf-8") as f:
content = f.read() content = f.read()
doc = Document( doc = Document(
page_content=content, page_content=content,
metadata={ metadata={
"source": os.path.abspath(file_path), "source": os.path.abspath(file_path),
"source_id": os.path.abspath(file_path) "source_id": os.path.abspath(file_path),
} },
) )
all_documents.append(doc) all_documents.append(doc)
print(f"Successfully loaded {file_path}") print(f"Successfully loaded {file_path}")
@ -151,9 +138,7 @@ class RAG:
if force_refresh: if force_refresh:
# Create new vector store from scratch # Create new vector store from scratch
vectorstore = Chroma.from_documents( vectorstore = Chroma.from_documents(
documents=chunks, documents=chunks, embedding=embeddings, persist_directory=self.db_dir
embedding=embeddings,
persist_directory=self.db_dir
) )
else: else:
# Update existing vector store # Update existing vector store
@ -163,7 +148,8 @@ class RAG:
# Find new documents # Find new documents
new_chunks = [ new_chunks = [
chunk for chunk in chunks chunk
for chunk in chunks
if chunk.metadata["source"] not in existing_sources if chunk.metadata["source"] not in existing_sources
] ]
@ -224,3 +210,136 @@ class RAG:
""" """
response = rag_chain.invoke({"query": query}) response = rag_chain.invoke({"query": query})
return response["result"] return response["result"]
def default_file_extensions():
"""Return the default file extensions representing common plain text and code files"""
return [
# Python
".py",
".pyi",
".pyx",
".pyc",
".pyd",
".pyw",
# C/C++
".c",
".cpp",
".cc",
".cxx",
".h",
".hpp",
".hxx",
".inc",
".inl",
".ipp",
# Rust
".rs",
".rlib",
".rmeta",
# Java
".java",
".jsp",
".jav",
".jar",
".class",
".kt",
".kts",
".groovy",
# Web
".html",
".htm",
".css",
".scss",
".sass",
".less",
".js",
".jsx",
".ts",
".tsx",
".vue",
".svelte",
# Fortran
".f",
".for",
".f90",
".f95",
".f03",
".f08",
# Go
".go",
".mod",
# Ruby
".rb",
".rbw",
".rake",
".gemspec",
# PHP
".php",
".phtml",
".php3",
".php4",
".php5",
".phps",
# C#
".cs",
".csx",
".vb",
# Swift
".swift",
".swiftmodule",
# Shell/Scripts
".sh",
".bash",
".zsh",
".fish",
".ps1",
".bat",
".cmd",
# Scala
".scala",
".sc",
# Haskell
".hs",
".lhs",
".hsc",
# Lua
".lua",
".luac",
# R
".r",
".rmd",
".rds",
# Perl
".pl",
".pm",
".t",
# Documentation
".txt",
".md",
".rst",
".adoc",
".wiki",
# Build/Config
".toml",
".yaml",
".yml",
".json",
".xml",
".ini",
".conf",
".cfg",
# SQL
".sql",
".mysql",
".pgsql",
".sqlite",
# Lisp family
".lisp",
".cl",
".el",
".clj",
".cljc",
".cljs",
".edn",
]