Initial commit
Commit e0d0962fc5
README.md | 6 lines (new file)
@@ -0,0 +1,6 @@
# code_rag

This is intended to be a fully local Retrieval-Augmented Generation (RAG) tool for software development.
It will be loadable as a Neovim plugin, which retrieves the current workspace files and adds them to a persistent Chroma vector store.
The tool will expose an Ollama API to query locally running Ollama models with prompts augmented by relevant context from the current code.
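For orientation, the workflow the README describes maps onto the `RAG` class added in `src/code_rag/rag.py` further down in this commit. A minimal usage sketch (the paths and model name below are placeholders, not part of this commit):

```python
from code_rag.rag import RAG

# Placeholder locations; the intended Neovim integration would point these at the
# current workspace and a persistent Chroma database directory.
rag = RAG(docs_dir="documents/", db_dir="vector_db/", tracker_file="tracker.json")

rag.create_vector_db(extension=".txt")      # index the workspace into Chroma
chain = rag.setup_rag(model_name="llama3")  # retriever + locally running Ollama model
print(rag.query_rag(chain, "What does the document tracker store?"))
```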
document_tracker.json | 97 lines (new file)
@@ -0,0 +1,97 @@
{
  "/tmp/tmpje6soo0x/documents/python_info.txt": {
    "chunk_ids": [
      "c0b04224-f72e-4f6e-bef1-067f1cf5c716"
    ]
  },
  "/tmp/tmpy2siaefv/documents/python_info.txt": {
    "chunk_ids": [
      "27dc4179-c908-47dd-8e56-2e2b3c403faf"
    ]
  },
  "/tmp/tmp_fjaagdf/documents/python_info.txt": {
    "chunk_ids": [
      "3c147f7f-bd5a-456b-8bee-f9a43b6721c9"
    ]
  },
  "/tmp/tmpfbgc8zgm/documents/python_info.txt": {
    "chunk_ids": [
      "34eb0065-8a18-4144-890c-2e099d334b88"
    ]
  },
  "/tmp/tmp8u9_vv_c/documents/python_info.txt": {
    "chunk_ids": [
      "6cd7655c-8fa2-437c-b8fc-5690c928b89e"
    ]
  },
  "/tmp/tmpm_quzgav/documents/python_info.txt": {
    "chunk_ids": [
      "9e452f02-fe93-4b65-b883-f585df25dd4d"
    ]
  },
  "/tmp/tmpjrncrqpe/documents/doc1.txt": {
    "chunk_ids": [
      "81cea007-7a54-4d19-869d-3643fd8cf257"
    ]
  },
  "/tmp/tmpjrncrqpe/documents/doc2.txt": {
    "chunk_ids": [
      "eb1bd307-e065-49a9-8d73-007403d13bd0"
    ]
  },
  "/tmp/tmpjrncrqpe/documents/doc3.txt": {
    "chunk_ids": [
      "3adccb38-28df-496d-b5a1-fa4555ea1fb5"
    ]
  },
  "/tmp/tmptyee73q5/documents/doc1.txt": {
    "chunk_ids": [
      "319dc9c4-0a96-4c49-9306-c9c11e94b613"
    ]
  },
  "/tmp/tmptyee73q5/documents/doc2.txt": {
    "chunk_ids": [
      "d20230ce-0753-4303-a9a2-d14379bb3d91"
    ]
  },
  "/tmp/tmptyee73q5/documents/doc3.txt": {
    "chunk_ids": [
      "ceb9e3c7-c84f-49b4-8da4-ec96ac39f7e5"
    ]
  },
  "/tmp/tmpq7xsv0n6/documents/doc1.txt": {
    "chunk_ids": [
      "2a7d43bd-3f25-46b2-8f9c-b5108afa06df"
    ]
  },
  "/tmp/tmpq7xsv0n6/documents/doc2.txt": {
    "chunk_ids": [
      "6e38e121-14a6-4c07-9717-b7babb0fbb2f"
    ]
  },
  "/tmp/tmpq7xsv0n6/documents/doc3.txt": {
    "chunk_ids": [
      "cbc80fab-f4bc-47c4-a482-36afab9a649e"
    ]
  },
  "/tmp/tmpi80m0y6n/documents/doc1.txt": {
    "chunk_ids": [
      "b145e673-0a4c-4058-a6d1-e1237bd2d9af"
    ]
  },
  "/tmp/tmpi80m0y6n/documents/doc2.txt": {
    "chunk_ids": [
      "32082799-0089-4a3d-b4ad-ea3ff0084212"
    ]
  },
  "/tmp/tmpi80m0y6n/documents/doc3.txt": {
    "chunk_ids": [
      "0902fef4-ce95-4a73-9e0c-90c1581f80b2"
    ]
  },
  "/tmp/tmpi80m0y6n/documents/python_info.txt": {
    "chunk_ids": [
      "009da108-76ba-4ef7-9c6b-f862f577ce23"
    ]
  }
}
poetry.lock | 4834 lines (generated, new file)
(File diff suppressed because it is too large)
pyproject.toml | 19 lines (new file)
@@ -0,0 +1,19 @@
[project]
name = "code-rag"
version = "0.1.0"
description = "Simple RAG implementation for use with neovim"
authors = [{ name = "Alex Selimov", email = "alex@alexselimov.com" }]
readme = "README.md"
requires-python = ">=3.9,<4.0"
dependencies = ["langchain (>=0.3.21,<0.4.0)", "ollama (>=0.4.7,<0.5.0)", "langchain-community (>=0.3.20,<0.4.0)", "langchain-ollama (>=0.2.3,<0.3.0)", "chromadb (>=0.6.3,<0.7.0)", "unstructured (>=0.17.2,<0.18.0)"]

[tool.poetry]
packages = [{ include = "code_rag", from = "src" }]

[tool.poetry.group.dev.dependencies]
pytest = "^8.3.5"

[build-system]
requires = ["poetry-core>=2.0.0,<3.0.0"]
build-backend = "poetry.core.masonry.api"
src/code_rag/__init__.py | 0 lines (new file, empty)
src/code_rag/__pycache__/__init__.cpython-313.pyc | BIN (new file, binary not shown)
src/code_rag/__pycache__/doc_tracker.cpython-313.pyc | BIN (new file, binary not shown)
src/code_rag/__pycache__/rag.cpython-313.pyc | BIN (new file, binary not shown)
src/code_rag/doc_tracker.py | 174 lines (new file)
@@ -0,0 +1,174 @@
import os
import hashlib
import json
from datetime import datetime
import time


class DocMetaData:
    """
    Class that stores document metadata. Using this so we know that the metadata is always
    initialized and can avoid a lot of checking whether keys exist or not.
    """

    def __init__(self, mod_time, hash, last_updated, chunk_ids):
        self.mod_time = mod_time
        self.hash = hash
        self.last_updated = last_updated
        self.chunk_ids = chunk_ids

    def __eq__(self, other):
        if isinstance(other, DocMetaData):
            return (
                self.mod_time == other.mod_time
                and self.hash == other.hash
                and self.last_updated == other.last_updated
                and self.chunk_ids == other.chunk_ids
            )
        return NotImplemented

    def update_chunks(self, chunks):
        self.chunk_ids = chunks

    def to_dict(self):
        return {
            "mod_time": self.mod_time,
            "hash": self.hash,
            "last_updated": self.last_updated,
            "chunk_ids": self.chunk_ids,
        }

    @classmethod
    def from_file(cls, file_path):
        return cls(
            os.path.getmtime(file_path), calculate_file_hash(file_path), time.time(), []
        )

    @classmethod
    def from_dict(cls, input_dict):
        return cls(
            input_dict["mod_time"],
            input_dict["hash"],
            input_dict["last_updated"],
            input_dict["chunk_ids"],
        )


class DocumentTracker:
    """
    Tracks document changes using file hashes, modification times, and chunk IDs
    """

    def __init__(self, tracking_file):
        self.tracking_file = tracking_file
        self.doc_info = self._load_tracking_data()

    def _load_tracking_data(self):
        """Load existing tracking data if available"""
        doc_info = dict()
        if os.path.exists(self.tracking_file):
            with open(self.tracking_file, "r") as f:
                serialized = json.load(f)
                for k, v in serialized.items():
                    doc_info[k] = DocMetaData.from_dict(v)

        return doc_info

    def _save_tracking_data(self):
        """Save tracking data to file"""
        output = dict()
        for k, v in self.doc_info.items():
            output[k] = v.to_dict()
        with open(self.tracking_file, "w") as f:
            json.dump(output, f, indent=2)

    def get_changed_files(self, directory, file_extension=".txt"):
        """
        Detect new, modified, and deleted files
        Returns: dict with 'new', 'modified', and 'deleted' lists
        """
        current_file_mod_times = {}
        for root, _, files in os.walk(directory):
            for file in files:
                if file.endswith(file_extension):
                    file_path = os.path.join(root, file)
                    mod_time = os.path.getmtime(file_path)
                    current_file_mod_times[file_path] = mod_time

        new_files = []
        modified_files = []

        # Check for new or modified files
        for file_path, mod_time in current_file_mod_times.items():
            if file_path not in self.doc_info:
                new_files.append(file_path)
            elif mod_time > self.doc_info[file_path].mod_time:
                # Check if content actually changed using hash
                current_hash = calculate_file_hash(file_path)
                if current_hash != self.doc_info[file_path].hash:
                    modified_files.append(file_path)
                    self.doc_info[file_path].hash = current_hash
                    self.doc_info[file_path].mod_time = mod_time

        # Check for deleted files
        deleted_files = [f for f in self.doc_info if f not in current_file_mod_times]

        # Update tracking information for new files
        for file_path in new_files:
            self.doc_info[file_path] = DocMetaData(
                current_file_mod_times[file_path],
                calculate_file_hash(file_path),
                time.time(),
                [],  # Will store IDs of chunks derived from this document
            )

        # Update last_updated for modified files
        for file_path in modified_files:
            self.doc_info[file_path].last_updated = datetime.now().isoformat()

        self._save_tracking_data()

        return {"new": new_files, "modified": modified_files, "deleted": deleted_files}

    def update_chunk_mappings(self, file_path, chunk_ids):
        """Store the chunk IDs associated with a document"""
        if file_path not in self.doc_info:
            self.doc_info[file_path] = DocMetaData.from_file(file_path)

        self.doc_info[file_path].chunk_ids = chunk_ids
        self._save_tracking_data()

    def get_chunks_to_delete(self, deleted_files):
        """Get all chunk IDs associated with deleted files"""
        chunks_to_delete = []
        for file_path in deleted_files:
            if file_path in self.doc_info and self.doc_info[file_path].chunk_ids:
                chunks_to_delete.extend(self.doc_info[file_path].chunk_ids)
                # Remove the file from tracking after processing
                del self.doc_info[file_path]

        self._save_tracking_data()
        return chunks_to_delete

    def get_chunks_for_modified_files(self, modified_files):
        """Get chunk IDs for modified files that need to be deleted before re-indexing"""
        chunks_to_delete = []
        for file_path in modified_files:
            if file_path in self.doc_info and self.doc_info[file_path].chunk_ids:
                chunks_to_delete.extend(self.doc_info[file_path].chunk_ids)
                # Clear the chunk IDs (will be updated with new ones)
                self.doc_info[file_path].chunk_ids = []

        self._save_tracking_data()
        return chunks_to_delete


def calculate_file_hash(file_path):
    """Calculate MD5 hash of file contents"""
    hasher = hashlib.md5()
    with open(file_path, "rb") as f:
        buf = f.read(65536)
        while len(buf) > 0:
            hasher.update(buf)
            buf = f.read(65536)
    return hasher.hexdigest()
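As a reference for how the tracker above is meant to be driven, a minimal sketch (the paths and chunk IDs below are placeholders):

```python
from code_rag.doc_tracker import DocumentTracker

tracker = DocumentTracker("tracker.json")           # loads prior state if the file exists
changes = tracker.get_changed_files("documents/")   # {'new': [...], 'modified': [...], 'deleted': [...]}

# Chunk IDs for deleted and modified files are handed back so the caller can
# purge them from the vector store before re-indexing.
stale_chunks = tracker.get_chunks_to_delete(changes["deleted"])
stale_chunks += tracker.get_chunks_for_modified_files(changes["modified"])

# After re-chunking a file, record its new chunk IDs against it.
tracker.update_chunk_mappings("documents/example.txt", ["chunk-id-1", "chunk-id-2"])
```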
src/code_rag/rag.py | 228 lines (new file)
@@ -0,0 +1,228 @@
import os
import uuid
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

from code_rag.doc_tracker import DocumentTracker


class RAG:
    def __init__(self, docs_dir, db_dir, tracker_file):
        self.docs_dir = docs_dir
        self.db_dir = db_dir
        self.tracker = DocumentTracker(tracker_file)

    def process_documents(self, files, text_splitter):
        """Process document files into chunks with tracking metadata"""
        all_chunks = []
        file_chunk_map = {}

        for file_path in files:
            # Load the document
            loader = TextLoader(file_path)
            documents = loader.load()

            # Add source metadata
            for doc in documents:
                doc.metadata["source"] = file_path
                doc.metadata["source_id"] = file_path  # For easier identification

            # Split the document
            chunks = text_splitter.split_documents(documents)

            # Generate and track chunk IDs
            chunk_ids = []
            for chunk in chunks:
                chunk_id = str(uuid.uuid4())
                chunk.metadata["chunk_id"] = chunk_id
                chunk_ids.append(chunk_id)

            # Store chunk mappings if a tracker is configured
            if self.tracker:
                self.tracker.update_chunk_mappings(file_path, chunk_ids)

            file_chunk_map[file_path] = chunks
            all_chunks.extend(chunks)

        return all_chunks, file_chunk_map

    def create_vector_db(self, extension=".txt", force_refresh=False):
        """
        Create or update a vector database, with complete handling of changes
        """
        # Text splitter configuration
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=200
        )

        # Create embeddings
        embeddings = OllamaEmbeddings(model="nomic-embed-text")

        if force_refresh:
            print("Force refresh: Processing all documents")
            # Load all documents
            loader = DirectoryLoader(self.docs_dir, glob=f"**/*{extension}")
            all_documents = loader.load()

            # Add unique IDs to each document
            for doc in all_documents:
                doc.metadata["source"] = os.path.abspath(doc.metadata["source"])
                doc.metadata["source_id"] = doc.metadata["source"]

            # Split documents
            chunks = text_splitter.split_documents(all_documents)

            # Add chunk IDs and update tracker
            file_chunk_map = {}
            for chunk in chunks:
                chunk_id = str(uuid.uuid4())
                chunk.metadata["chunk_id"] = chunk_id

                source = chunk.metadata["source"]
                if source not in file_chunk_map:
                    file_chunk_map[source] = []

                file_chunk_map[source].append(chunk_id)

            # Update tracker with chunk mappings
            for file_path, chunk_ids in file_chunk_map.items():
                self.tracker.update_chunk_mappings(file_path, chunk_ids)

            print(
                f"Processing {len(all_documents)} documents with {len(chunks)} chunks"
            )

            # Create new vector store
            vectorstore = Chroma.from_documents(
                documents=chunks, embedding=embeddings, persist_directory=self.db_dir
            )
            vectorstore.persist()
            print(f"Created new vector database at {self.db_dir}")

            return vectorstore

        # Get changes since last update
        changed_files = self.tracker.get_changed_files(self.docs_dir)

        if not any(changed_files.values()):
            print("No document changes detected")
            # Load existing vector store if available
            if os.path.exists(self.db_dir):
                return Chroma(
                    persist_directory=self.db_dir, embedding_function=embeddings
                )
            else:
                print("No vector database exists. Creating from all documents...")
                return self.create_vector_db(force_refresh=True)

        # Process changes
        print(
            f"Changes detected - New: {len(changed_files['new'])}, Modified: {len(changed_files['modified'])}, Deleted: {len(changed_files['deleted'])}"
        )

        # Load existing vector store if it exists
        if os.path.exists(self.db_dir):
            vectorstore = Chroma(
                persist_directory=self.db_dir, embedding_function=embeddings
            )

            # 1. Handle deleted documents
            if changed_files["deleted"]:
                chunks_to_delete = self.tracker.get_chunks_to_delete(
                    changed_files["deleted"]
                )
                if chunks_to_delete:
                    print(
                        f"Removing {len(chunks_to_delete)} chunks from deleted documents"
                    )
                    # Delete the chunks from vector store
                    vectorstore._collection.delete(
                        where={"chunk_id": {"$in": chunks_to_delete}}
                    )

            # 2. Handle modified documents (delete old chunks first)
            chunks_to_delete_modified = self.tracker.get_chunks_for_modified_files(
                changed_files["modified"]
            )
            if chunks_to_delete_modified:
                print(
                    f"Removing {len(chunks_to_delete_modified)} chunks from modified documents"
                )
                vectorstore._collection.delete(
                    where={"chunk_id": {"$in": chunks_to_delete_modified}}
                )

            # 3. Process new and modified documents
            files_to_process = changed_files["new"] + changed_files["modified"]
            if files_to_process:
                # process_documents reads the tracker from self, so it is not passed in
                chunks, _ = self.process_documents(files_to_process, text_splitter)
                print(f"Adding {len(chunks)} new chunks to the vector store")
                vectorstore.add_documents(chunks)
        else:
            # If no existing DB, create from all documents
            print("No existing vector database. Creating from all documents...")
            return self.create_vector_db(force_refresh=True)

        # Persist changes
        vectorstore.persist()
        print(f"Vector database updated at {self.db_dir}")

        return vectorstore

    def setup_rag(self, model_name="llama3"):
        """
        Set up the RAG system with an existing vector database
        """
        # Load the embeddings
        embeddings = OllamaEmbeddings(model="nomic-embed-text")

        # Load the vector store
        vectorstore = Chroma(
            persist_directory=self.db_dir, embedding_function=embeddings
        )

        # Create a retriever
        retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

        # Set up the LLM
        llm = Ollama(model=model_name)

        # Create a custom prompt template
        template = """
        Answer the question based on the context provided. If you don't know the answer,
        just say you don't know. Don't try to make up an answer.

        Context: {context}

        Question: {question}

        Answer:
        """

        prompt = PromptTemplate(
            input_variables=["context", "question"], template=template
        )

        # Create the RAG chain
        rag_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=retriever,
            chain_type_kwargs={"prompt": prompt},
        )

        return rag_chain

    def query_rag(self, rag_chain, query):
        """
        Query the RAG system
        """
        response = rag_chain.invoke({"query": query})
        return response["result"]
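The incremental path of `create_vector_db` (without `force_refresh`) is the part that leans on `DocumentTracker`: only chunks from deleted or modified files are removed from Chroma, and only new or modified files are re-chunked and re-embedded. A minimal sketch of that update cycle, again with placeholder paths:

```python
from code_rag.rag import RAG

rag = RAG("documents/", "vector_db/", "tracker.json")

# First run: embed everything and record chunk IDs per source file.
rag.create_vector_db(force_refresh=True)

# ... documents/ changes on disk (files added, edited, or removed) ...

# Later runs: stale chunks are deleted from the Chroma collection and only
# the changed files are re-chunked and re-embedded.
rag.create_vector_db()
```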
tests/__init__.py | 0 lines (new file, empty)
tests/__pycache__/__init__.cpython-313.pyc | BIN (new file, binary not shown)
tests/__pycache__/fixtures.cpython-313.pyc | BIN (new file, binary not shown)
tests/__pycache__/test_doc_tracker.cpython-313-pytest-8.3.5.pyc | BIN (new file, binary not shown)
tests/__pycache__/test_rag.cpython-313-pytest-8.3.5.pyc | BIN (new file, binary not shown)
tests/__pycache__/utility.cpython-313.pyc | BIN (new file, binary not shown)
tests/fixtures.py | 66 lines (new file)
@@ -0,0 +1,66 @@
import pytest
import shutil
import tempfile
import os


@pytest.fixture
def temp_dir():
    """Create a temporary directory for test files"""
    temp_dir = tempfile.mkdtemp()
    yield temp_dir
    shutil.rmtree(temp_dir)


@pytest.fixture
def docs_dir(temp_dir):
    """Create a temporary documents directory"""
    docs_dir = os.path.join(temp_dir, "documents")
    os.makedirs(docs_dir)
    yield docs_dir


@pytest.fixture
def db_dir(temp_dir):
    """Create a temporary vector database directory"""
    db_dir = os.path.join(temp_dir, "vector_db")
    os.makedirs(db_dir)
    yield db_dir


@pytest.fixture
def tracker_file(temp_dir):
    """Create a temporary tracker file"""
    tracker_path = os.path.join(temp_dir, "test_tracker.json")
    yield tracker_path
    # Clean up after tests
    if os.path.exists(tracker_path):
        os.remove(tracker_path)


@pytest.fixture
def sample_docs(docs_dir):
    """Create sample text documents for testing"""
    # Create a few sample documents
    doc1_path = os.path.join(docs_dir, "doc1.txt")
    doc2_path = os.path.join(docs_dir, "doc2.txt")
    doc3_path = os.path.join(docs_dir, "doc3.txt")

    with open(doc1_path, "w") as f:
        f.write("This is a sample document about artificial intelligence. " * 10)

    with open(doc2_path, "w") as f:
        f.write("This document discusses machine learning concepts. " * 10)

    with open(doc3_path, "w") as f:
        f.write("Natural language processing is a field of AI. " * 10)

    return [doc1_path, doc2_path, doc3_path]


@pytest.fixture
def text_splitter():
    """Create a mock text splitter"""
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    return RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
tests/test_doc_tracker.py | 157 lines (new file)
@@ -0,0 +1,157 @@
import os
import time
import pytest

from .fixtures import *
from code_rag.doc_tracker import DocMetaData, DocumentTracker, calculate_file_hash


def doc_infos_are_equal(left, right):
    """Check to see if two doc_infos are the same"""
    for k, v in left.items():
        try:
            print(v.to_dict(), right[k].to_dict(), v.to_dict() == right[k].to_dict())
            if v != right[k]:
                return False
        except KeyError:
            return False
    return True


@pytest.fixture
def document_tracker(tracker_file):
    """Create a DocumentTracker instance"""
    return DocumentTracker(tracking_file=tracker_file)


# Tests for DocumentTracker
def test_init_new_tracker(tracker_file):
    """Test creating a new tracker"""
    tracker = DocumentTracker(tracking_file=tracker_file)
    assert tracker.doc_info == {}
    assert not os.path.exists(tracker_file)


def test_save_and_load_tracking_data(document_tracker, tracker_file):
    """Test saving and loading tracking data"""
    # Add some data
    update_time = time.time()
    document_tracker.doc_info = {
        "test.txt": DocMetaData(123456, "abcdef", update_time, ["1", "2"])
    }
    document_tracker._save_tracking_data()

    # Check file exists
    assert os.path.exists(tracker_file)

    # Create a new tracker that should load the data
    new_tracker = DocumentTracker(tracking_file=tracker_file)
    assert doc_infos_are_equal(
        new_tracker.doc_info,
        {"test.txt": DocMetaData(123456, "abcdef", update_time, ["1", "2"])},
    )


def test_calculate_file_hash(document_tracker, sample_docs):
    """Test hash calculation for a file"""
    file_path = sample_docs[0]
    hash1 = calculate_file_hash(file_path)

    # Same content should yield same hash
    hash2 = calculate_file_hash(file_path)
    assert hash1 == hash2

    # Different content should yield different hash
    with open(file_path, "a") as f:
        f.write("Additional content")

    hash3 = calculate_file_hash(file_path)
    assert hash1 != hash3


def test_get_changed_files_new(document_tracker, docs_dir, sample_docs):
    """Test detecting new files"""
    changes = document_tracker.get_changed_files(docs_dir)
    assert set(changes["new"]) == set(sample_docs)
    assert changes["modified"] == []
    assert changes["deleted"] == []

    # Verify tracking was updated
    for file_path in sample_docs:
        assert file_path in document_tracker.doc_info
        assert document_tracker.doc_info[file_path].chunk_ids == []


def test_get_changed_files_modified(document_tracker, docs_dir, sample_docs):
    """Test detecting modified files"""
    # First scan to establish tracking
    document_tracker.get_changed_files(docs_dir)

    # Modify a file and wait to ensure timestamp difference
    time.sleep(0.1)
    with open(sample_docs[0], "a") as f:
        f.write("Modified content")

    # Detect changes
    changes = document_tracker.get_changed_files(docs_dir)
    assert changes["new"] == []
    assert changes["modified"] == [sample_docs[0]]
    assert changes["deleted"] == []


def test_get_changed_files_deleted(document_tracker, docs_dir, sample_docs):
    """Test detecting deleted files"""
    # First scan to establish tracking
    document_tracker.get_changed_files(docs_dir)

    # Delete a file
    os.remove(sample_docs[0])

    # Detect changes
    changes = document_tracker.get_changed_files(docs_dir)
    assert changes["new"] == []
    assert changes["modified"] == []
    assert changes["deleted"] == [sample_docs[0]]


def test_update_chunk_mappings(document_tracker, sample_docs):
    """Test updating chunk mappings"""
    file_path = sample_docs[0]
    chunk_ids = ["chunk1", "chunk2", "chunk3"]

    # First make sure the file is tracked
    document_tracker.doc_info[file_path] = DocMetaData(
        123,
        "abc",
        "2023-01-01",
        [],
    )

    # Update chunk mappings
    document_tracker.update_chunk_mappings(file_path, chunk_ids)
    assert document_tracker.doc_info[file_path].chunk_ids == chunk_ids


def test_get_chunks_to_delete(document_tracker):
    """Test getting chunks to delete for deleted files"""
    # Setup tracking data
    document_tracker.doc_info = {
        "file1.txt": DocMetaData(0, "abc", 0, ["chunk1", "chunk2"]),
        "file2.txt": DocMetaData(0, "abc", 0, ["chunk3", "chunk4"]),
        "file3.txt": DocMetaData(0, "abc", 0, ["chunk5"]),
    }

    # Test with one deleted file
    chunks = document_tracker.get_chunks_to_delete(["file1.txt"])
    assert set(chunks) == {"chunk1", "chunk2"}

    # Verify file was removed from tracking
    assert "file1.txt" not in document_tracker.doc_info

    # Test with multiple deleted files
    chunks = document_tracker.get_chunks_to_delete(["file2.txt", "file3.txt"])
    assert set(chunks) == {"chunk3", "chunk4", "chunk5"}

    # Verify files were removed from tracking
    assert "file2.txt" not in document_tracker.doc_info
    assert "file3.txt" not in document_tracker.doc_info
tests/test_rag.py | 124 lines (new file)
@@ -0,0 +1,124 @@
import os
import shutil
import pytest

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from .fixtures import *
from .utility import *
from code_rag.rag import RAG
from code_rag.doc_tracker import DocumentTracker


@pytest.fixture
def rag_pipeline(docs_dir, db_dir, tracker_file):
    """Create a RAG instance"""
    return RAG(docs_dir, db_dir, tracker_file)


# Tests for document processing
def test_process_documents(tracker_file, docs_dir, db_dir, sample_docs, rag_pipeline):
    """Test processing documents into chunks with tracking"""
    rag_pipeline = RAG(docs_dir, db_dir, tracker_file)
    files = [
        os.path.join(rag_pipeline.docs_dir, "doc1.txt"),
        os.path.join(rag_pipeline.docs_dir, "doc2.txt"),
    ]

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

    chunks, file_chunk_map = rag_pipeline.process_documents(files, text_splitter)

    # Verify chunks were created
    assert len(chunks) >= 2  # At least one chunk per document
    tracker = rag_pipeline.tracker
    # Verify chunk IDs were tracked
    for file_path in files:
        assert file_path in tracker.doc_info
        # doc_info values are DocMetaData objects, so chunk_ids is an attribute
        assert len(tracker.doc_info[file_path].chunk_ids) > 0

    # Verify metadata in chunks
    for chunk in chunks:
        assert "source" in chunk.metadata
        assert "chunk_id" in chunk.metadata
        assert chunk.metadata["source"] in files


@pytest.mark.skipif(
    not shutil.which("ollama"), reason="Ollama not installed or not in PATH"
)
def test_create_vector_db(docs_dir, db_dir, tracker_file, sample_docs):
    """Test creating a vector database"""
    rag_pipeline = RAG(docs_dir, db_dir, tracker_file)
    # Create initial vector database
    vectorstore = rag_pipeline.create_vector_db(force_refresh=True)

    # Verify it was created
    assert os.path.exists(rag_pipeline.db_dir)
    assert vectorstore is not None
    # Check the database has content
    embeddings = OllamaEmbeddings(model="nomic-embed-text")
    loaded_db = Chroma(
        persist_directory=rag_pipeline.db_dir, embedding_function=embeddings
    )
    assert loaded_db._collection.count() > 0


@pytest.mark.skipif(
    not shutil.which("ollama"), reason="Ollama not installed or not in PATH"
)
def test_update_vector_db_with_changes(docs_dir, db_dir, tracker_file, sample_docs):
    """Test updating a vector database with document changes"""
    rag_pipeline = RAG(docs_dir, db_dir, tracker_file)
    # Create initial vector database
    rag_pipeline.create_vector_db(force_refresh=True)

    # Get initial count
    embeddings = OllamaEmbeddings(model="nomic-embed-text")
    initial_db = Chroma(
        persist_directory=rag_pipeline.db_dir, embedding_function=embeddings
    )
    initial_count = initial_db._collection.count()

    # Make changes to documents: add a new document
    create_test_document(
        docs_dir, "newdoc.txt", "This is a brand new document for testing."
    )

    # Update the vector database
    rag_pipeline.create_vector_db()

    # Check the database has been updated
    updated_db = Chroma(
        persist_directory=rag_pipeline.db_dir, embedding_function=embeddings
    )
    assert updated_db._collection.count() > initial_count


# Final integration test - full RAG pipeline
@pytest.mark.skipif(
    not shutil.which("ollama"), reason="Ollama not installed or not in PATH"
)
def test_full_rag_pipeline(docs_dir, db_dir, tracker_file, sample_docs):
    """Test the entire RAG pipeline from document processing to querying"""
    rag_pipeline = RAG(docs_dir, db_dir, tracker_file)
    # Create a specific document with known content
    test_content = "Python is a high-level programming language known for its readability and versatility."
    create_test_document(rag_pipeline.docs_dir, "python_info.txt", test_content)

    # Create vector database
    rag_pipeline.create_vector_db(force_refresh=True)

    # Set up RAG
    rag_chain = rag_pipeline.setup_rag(model_name="llama3.2")

    # Query the system
    query = "What is Python?"
    response = rag_pipeline.query_rag(rag_chain, query)

    # Check if response contains relevant information
    # This is a soft test since the exact response will depend on the LLM
    assert response.strip() != ""
    assert "programming" in response.lower() or "language" in response.lower()
tests/utility.py | 26 lines (new file)
@@ -0,0 +1,26 @@
import time
import os


def create_test_document(base_path, filename, content):
    """Create a test document with specified content"""
    file_path = os.path.join(base_path, filename)
    # Create directory if it doesn't exist (for subdirectories)
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, "w") as f:
        f.write(content)
    return file_path


def modify_test_document(file_path, new_content):
    """Modify an existing test document"""
    with open(file_path, "w") as f:
        f.write(new_content)
    # Ensure modification time changes
    time.sleep(0.1)


def delete_test_document(file_path):
    """Delete a test document"""
    if os.path.exists(file_path):
        os.remove(file_path)