Merge pull request #14 from ydennisy/feat/adding-new-vwesion-kg1

feat: the absolute final re-start of this project!
ydennisy · Mar 24, 2024 · 01526d2 · 01526d2
2 parents 606c447 + 0ad819d
commit 01526d2
Show file tree

Hide file tree

Showing 55 changed files with 17,350 additions and 10 deletions.
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -1,16 +1,14 @@
 {
-  //"editor.defaultFormatter": "esbenp.prettier-vscode",
-  //"editor.formatOnSave": true,
-  "[javascript]": {
-    "editor.defaultFormatter": "esbenp.prettier-vscode",
-    "editor.formatOnSave": true
-  },
+  "editor.defaultFormatter": "esbenp.prettier-vscode",
+  "editor.formatOnSave": true,
+  "css.customData": [".vscode/tailwind.json"],
   "[typescript]": {
-    "editor.defaultFormatter": "esbenp.prettier-vscode",
-    "editor.formatOnSave": true
+    "editor.defaultFormatter": "esbenp.prettier-vscode"
+  },
+  "[vue]": {
+    "editor.defaultFormatter": "esbenp.prettier-vscode"
   },
   "[python]": {
-    "editor.defaultFormatter": "ms-python.black-formatter",
-    "editor.formatOnSave": true
+    "editor.defaultFormatter": "ms-python.black-formatter"
   }
 }
diff --git a/.vscode/tailwind.json b/.vscode/tailwind.json
@@ -0,0 +1,55 @@
+{
+  "version": 1.1,
+  "atDirectives": [
+    {
+      "name": "@tailwind",
+      "description": "Use the `@tailwind` directive to insert Tailwind's `base`, `components`, `utilities` and `screens` styles into your CSS.",
+      "references": [
+        {
+          "name": "Tailwind Documentation",
+          "url": "https://tailwindcss.com/docs/functions-and-directives#tailwind"
+        }
+      ]
+    },
+    {
+      "name": "@apply",
+      "description": "Use the `@apply` directive to inline any existing utility classes into your own custom CSS. This is useful when you find a common utility pattern in your HTML that you’d like to extract to a new component.",
+      "references": [
+        {
+          "name": "Tailwind Documentation",
+          "url": "https://tailwindcss.com/docs/functions-and-directives#apply"
+        }
+      ]
+    },
+    {
+      "name": "@responsive",
+      "description": "You can generate responsive variants of your own classes by wrapping their definitions in the `@responsive` directive:\n```css\n@responsive {\n  .alert {\n    background-color: #E53E3E;\n  }\n}\n```\n",
+      "references": [
+        {
+          "name": "Tailwind Documentation",
+          "url": "https://tailwindcss.com/docs/functions-and-directives#responsive"
+        }
+      ]
+    },
+    {
+      "name": "@screen",
+      "description": "The `@screen` directive allows you to create media queries that reference your breakpoints by **name** instead of duplicating their values in your own CSS:\n```css\n@screen sm {\n  /* ... */\n}\n```\n…gets transformed into this:\n```css\n@media (min-width: 640px) {\n  /* ... */\n}\n```\n",
+      "references": [
+        {
+          "name": "Tailwind Documentation",
+          "url": "https://tailwindcss.com/docs/functions-and-directives#screen"
+        }
+      ]
+    },
+    {
+      "name": "@variants",
+      "description": "Generate `hover`, `focus`, `active` and other **variants** of your own utilities by wrapping their definitions in the `@variants` directive:\n```css\n@variants hover, focus {\n   .btn-brand {\n    background-color: #3182CE;\n  }\n}\n```\n",
+      "references": [
+        {
+          "name": "Tailwind Documentation",
+          "url": "https://tailwindcss.com/docs/functions-and-directives#variants"
+        }
+      ]
+    }
+  ]
+}
diff --git a/backend/.env.example b/backend/.env.example
@@ -0,0 +1,4 @@
+SUPABASE_URL=http://127.0.0.1:54321
+SUPABASE_KEY=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImV4cCI6MTk4MzgxMjk5Nn0.EGIM96RAZx35lJzdJsyH-qQwv8Hdp7fsn3W0YpN81IU
+OPENAI_API_KEY=sk...
+TOKENIZERS_PARALLELISM=false
diff --git a/backend/.gitignore b/backend/.gitignore
@@ -0,0 +1,3 @@
+.env
+.venv
+__pycache__
diff --git a/backend/Makefile b/backend/Makefile
@@ -0,0 +1,8 @@
+# Every target here is a command, not a file. Declaring them phony means a
+# stray file or directory named `dev` (etc.) can never make a target look
+# up to date and silently skip its recipe.
+.PHONY: dev supabase-start supabase-reset
+
+# Run the API with uvicorn in auto-reload mode.
+dev:
+	pipenv run uvicorn src.main:app --reload
+
+# Start the local Supabase stack, excluding the listed services.
+supabase-start:
+	supabase start -x inbucket,imgproxy,edge-runtime,realtime
+
+# Reset the local Supabase database.
+supabase-reset:
+	supabase db reset
diff --git a/backend/Pipfile b/backend/Pipfile
@@ -0,0 +1,27 @@
+[[source]]
+url = "https://pypi.org/simple"
+verify_ssl = true
+name = "pypi"
+
+[packages]
+fastapi = "*"
+uvicorn = {extras = ["standard"], version = "*"}
+supabase = "*"
+python-dotenv = "*"
+trafilatura = "*"
+lxml = "*"
+langchain = "*"
+sentence-transformers = "*"
+requests = "*"
+llama-index = "*"
+openai = "*"
+numpy = "*"
+pandas = "*"
+llama-index-readers-file = "*"
+httpx = "*"
+pymupdf = "*"
+
+[dev-packages]
+
+[requires]
+python_version = "3.11"
diff --git a/backend/Pipfile.lock b/backend/Pipfile.lock
diff --git a/backend/README.md b/backend/README.md
@@ -0,0 +1,3 @@
+# Personal Search Engine (PSE) - Backend
+
+## NOTE: I added some dependencies for a flight; remove any that turn out to be unused.
diff --git a/backend/src/db.py b/backend/src/db.py
@@ -0,0 +1,49 @@
+from __future__ import annotations
+
+import os
+from typing import TYPE_CHECKING
+
+from supabase import create_client
+
+if TYPE_CHECKING:
+    from backend.src.domain.node import TextNode
+    from src.domain import URL
+
+
+class DB:
+    """Thin persistence wrapper around a Supabase client.
+
+    The client is created from the ``SUPABASE_URL`` and ``SUPABASE_KEY``
+    environment variables when the instance is constructed.
+    """
+
+    def __init__(self) -> None:
+        self._client = create_client(
+            os.getenv("SUPABASE_URL"), os.getenv("SUPABASE_KEY")
+        )
+
+    def create_urls(self, urls: list[URL], user_id: str):
+        """Insert ``urls`` into ``urls_feed``, adding ``user_id`` to every row.
+
+        Returns the result of ``execute()`` on success. Returns ``None`` when
+        ``urls`` is empty (no request is made) or when the insert raises; in
+        the latter case the exception is printed instead of propagated.
+        """
+        if not urls:
+            # Nothing to write, so skip the round trip with an empty payload.
+            return None
+        data = [{**url.to_persistence(), "user_id": user_id} for url in urls]
+        try:
+            return self._client.table("urls_feed").insert(data).execute()
+        except Exception as ex:
+            # Failures are reported, not raised; the caller receives None.
+            print(ex)
+            return None
+
+    def update_urls(self, urls: list[URL]):
+        """Write each URL's current ``status`` to its ``urls_feed`` row.
+
+        Rows are matched on ``id`` and updated one request at a time. A
+        failure for one URL is printed and the loop moves on to the next.
+        """
+        for url in urls:
+            try:
+                (
+                    self._client.table("urls_feed")
+                    .update({"status": url.status})
+                    .eq("id", url.id)
+                    .execute()
+                )
+            except Exception as ex:
+                print(f"Failed to update URL with id {url.id}: {ex}")
+
+    def create_text_nodes(self, nodes: list[TextNode]):
+        """Insert ``nodes`` into ``text_nodes`` and their chunks into ``text_node_chunks``.
+
+        Each table receives one bulk insert, nodes first. An insert is skipped
+        when there are no rows for that table. Exceptions raised by the client
+        propagate to the caller.
+        """
+        text_nodes_to_persist = []
+        text_node_chunks_to_persist = []
+        for node in nodes:
+            text_node, text_node_chunks = node.to_persistence()
+            text_nodes_to_persist.append(text_node)
+            text_node_chunks_to_persist.extend(text_node_chunks)
+
+        if not text_nodes_to_persist:
+            # No nodes means no chunks either; nothing to send.
+            return
+
+        self._client.table("text_nodes").insert(text_nodes_to_persist).execute()
+        # NOTE(review): the two inserts are separate requests, so a failure
+        # below leaves the node rows in place without their chunks.
+        if text_node_chunks_to_persist:
+            self._client.table("text_node_chunks").insert(
+                text_node_chunks_to_persist
+            ).execute()
diff --git a/backend/src/domain/__init__.py b/backend/src/domain/__init__.py
@@ -0,0 +1,7 @@
+from src.domain.url import URL
+from src.domain.url import URLStatus
+from src.domain.node import TextNode
+from src.domain.node import TextNodeChunk
+
+
+__all__ = ["URL", "URLStatus", "TextNode", "TextNodeChunk"]
diff --git a/backend/src/domain/node.py b/backend/src/domain/node.py
@@ -0,0 +1,61 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING
+
+from src.utils.uuid import uuid7
+
+if TYPE_CHECKING:
+    from src.utils.chunker import NodeChunker
+    from src.utils.embedder import NodeEmbedder
+
+
+class TextNodeChunk:
+    """One piece of a text node's text, with a slot for its embedding."""
+
+    def __init__(self, text: str, text_node_id: str) -> None:
+        # Every chunk gets its own freshly generated id.
+        self.id = uuid7()
+        self.text = text
+        self.text_node_id = text_node_id
+        # Stays None until an embedding is assigned from outside.
+        self.embedding = None
+
+
+class TextNode:
+    """A text document tied to a URL feed entry, plus its embedding and chunks."""
+
+    def __init__(self, url_feed_id: str, url: str, title: str, text: str) -> None:
+        self.id = uuid7()
+        self.url_feed_id = url_feed_id
+        self.url = url
+        self.title = title
+        self.text = text
+        # Populated later by create_chunks() and create_embeddings().
+        self.chunks: list[TextNodeChunk] = []
+        self.embedding = None
+
+    def create_chunks(self, chunker: NodeChunker) -> None:
+        """Replace ``self.chunks`` with ``chunker.chunk(self.id, self.text)``."""
+        produced = chunker.chunk(self.id, self.text)
+        self.chunks = produced
+
+    def create_embeddings(self, embedder: NodeEmbedder) -> None:
+        """Embed the node and all of its chunks with one ``embedder.embed`` call.
+
+        The first input is ``url + title + text`` joined with no separator;
+        the chunk texts follow in order. Every returned vector is converted
+        with ``.tolist()`` before it is stored.
+        """
+        node_input = self.url + self.title + self.text
+        inputs = [node_input]
+        inputs.extend(piece.text for piece in self.chunks)
+
+        vectors = embedder.embed(inputs)
+        self.embedding = vectors[0].tolist()
+        # zip() stops at the shorter side, so a length mismatch is not an error.
+        for piece, vector in zip(self.chunks, vectors[1:]):
+            piece.embedding = vector.tolist()
+
+    def to_persistence(self):
+        """Return ``(node_row, chunk_rows)`` as plain dicts."""
+        node_row = {
+            "id": self.id,
+            "url_feed_id": self.url_feed_id,
+            "url": self.url,
+            "title": self.title,
+            "text": self.text,
+            "embedding": self.embedding,
+        }
+        # Chunk rows point back at this node through its own id.
+        chunk_rows = [
+            {
+                "id": piece.id,
+                "text": piece.text,
+                "embedding": piece.embedding,
+                "text_node_id": self.id,
+            }
+            for piece in self.chunks
+        ]
+        return node_row, chunk_rows
diff --git a/backend/src/domain/url.py b/backend/src/domain/url.py
@@ -0,0 +1,59 @@
+from enum import Enum
+from typing import TypedDict
+
+from src.utils.uuid import uuid7
+
+
+class URLStatus(Enum):
+    """Indexing states of a URL; each member's value is its own name as a string."""
+
+    RECEIVED_AWAITING_INDEXING = "RECEIVED_AWAITING_INDEXING"
+    # NOTE(review): "SKIPED" is misspelled, but the value is what
+    # URL.to_persistence() emits, so check stored rows before renaming it.
+    INDEXING_SKIPED_AS_RECENT_DUPLICATE = "INDEXING_SKIPED_AS_RECENT_DUPLICATE"
+    INDEXED_SUCCESSFULLY = "INDEXED_SUCCESSFULLY"
+    INDEXING_FAILED = "INDEXING_FAILED"
+
+
+class URLPersistence(TypedDict):
+    """Shape of the dict returned by ``URL.to_persistence()``."""
+
+    # Declared as str to agree with the ``URL.id`` property's annotation.
+    # The value itself comes from uuid7() — TODO confirm that returns a str.
+    id: str
+    url: str
+    raw_url: str
+    status: str
+
+
+class URL:
+    """A URL with a generated id and an indexing status."""
+
+    def __init__(self, url: str) -> None:
+        self._id = uuid7()
+        self._url = url
+        self._raw_url = url
+        # TODO: this needs more thought as currently this cleaning can cause failures
+        # self.url = self.clean_url(url)
+        # Until that is resolved, `_url` and `_raw_url` hold the same string.
+        self._status: URLStatus = URLStatus.RECEIVED_AWAITING_INDEXING
+
+    @property
+    def id(self) -> str:
+        """The id generated when the object was created."""
+        return self._id
+
+    @property
+    def url(self) -> str:
+        """The URL string as stored on this object."""
+        return self._url
+
+    @property
+    def status(self) -> str:
+        """The current status as its string value, not the enum member."""
+        return self._status.value
+
+    def clean_url(self, url: str) -> str:
+        """Return ``url`` without trailing slashes and with its scheme upgraded to https.
+
+        Only a leading ``http://`` is rewritten. A URL that merely contains
+        ``http://`` further along (for example inside a query parameter) keeps
+        that part untouched.
+        """
+        url = url.rstrip("/")
+        if url.startswith("http://"):
+            url = "https://" + url.removeprefix("http://")
+        return url
+
+    def set_indexing_success(self):
+        """Mark the URL as indexed successfully."""
+        self._status = URLStatus.INDEXED_SUCCESSFULLY
+
+    def set_indexing_failure(self):
+        """Mark the URL as failed to index."""
+        self._status = URLStatus.INDEXING_FAILED
+
+    def to_persistence(self) -> URLPersistence:
+        """Return the id, url, raw_url and status value as a plain dict."""
+        return {
+            "id": self._id,
+            "url": self._url,
+            "raw_url": self._raw_url,
+            "status": self._status.value,
+        }
diff --git a/backend/src/llm.py b/backend/src/llm.py
@@ -0,0 +1,57 @@
+import json
+from collections.abc import Iterator
+from typing import List
+
+from openai import OpenAI
+
+# Module-level OpenAI client, created once when this module is imported.
+# NOTE(review): no credentials are passed here; presumably the library picks
+# them up from the environment (OPENAI_API_KEY) — confirm.
+client = OpenAI()
+
+# Model name passed to the chat completions call below.
+# NOTE(review): this is a dated snapshot name — confirm it is still available.
+MODEL = "gpt-3.5-turbo-0613"
+
+# Prompt template with {context} and {question} placeholders. The pieces
+# below are adjacent string literals, which Python joins with nothing in
+# between, so each piece must end with its own space or newline.
+# {context} is expected to end with a newline so that the rule following it
+# starts on a fresh line.
+PROMPT_TEMPLATE = (
+    "A question and context documents are provided below. "
+    "If the required information is not available in the context documents to answer the question, "
+    "explain to the user that you are missing the required information. "
+    "If you are able to answer make sure you quote the source. "
+    "You will be provided a JSON line for each context document, which contains <text>, <title> & <url> keys. "
+    "At the END of your reply (not inline), you should list all unique sources using the <title> & <url> keys. "
+    "Answer using Markdown.\n"
+    "---------------------\n"
+    "CONTEXT:\n"
+    "{context}"
+    "---------------------\n"
+    "QUESTION:\n"
+    "{question}"
+)
+
+
+def format_chunks(chunks: List[dict]) -> str:
+    """Serialize ``chunks`` as JSON lines, leaving out the ``id`` and ``score`` keys.
+
+    Each chunk becomes one ``json.dumps`` line terminated by a newline, with
+    the remaining keys in their original order. The input dicts are not
+    modified, and a chunk that lacks ``id`` or ``score`` is accepted as is.
+    An empty list yields an empty string.
+    """
+    excluded = ("id", "score")
+    return "".join(
+        json.dumps({key: value for key, value in chunk.items() if key not in excluded})
+        + "\n"
+        for chunk in chunks
+    )
+
+
+def answer_with_context(chunks: List[dict], question: str) -> Iterator[str]:
+    """Stream the model's answer to ``question``, using ``chunks`` as context.
+
+    ``chunks`` are rendered with ``format_chunks`` and substituted, together
+    with ``question``, into ``PROMPT_TEMPLATE``; the result is sent as a
+    single user message with streaming enabled.
+
+    This is a generator: nothing is sent until the first item is requested.
+
+    Yields:
+        One string per streamed event — the event's text delta, or ``""``
+        when the delta carries no content.
+    """
+    prompt = PROMPT_TEMPLATE.format(
+        context=format_chunks(chunks), question=question
+    )
+    stream = client.chat.completions.create(
+        messages=[{"role": "user", "content": prompt}],
+        model=MODEL,
+        stream=True,
+    )
+    for event in stream:
+        # Deltas without text (None or "") still produce an item, as "".
+        # NOTE(review): assumes every event has at least one choice — confirm.
+        yield event.choices[0].delta.content or ""