Skip to content

Commit

Permalink
Merge pull request #14 from ydennisy/feat/adding-new-vwesion-kg1
Browse files Browse the repository at this point in the history
feat: the absolute final re-start of this project!
  • Loading branch information
ydennisy committed Mar 24, 2024
2 parents 606c447 + 0ad819d commit 01526d2
Show file tree
Hide file tree
Showing 55 changed files with 17,350 additions and 10 deletions.
18 changes: 8 additions & 10 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,16 +1,14 @@
{
//"editor.defaultFormatter": "esbenp.prettier-vscode",
//"editor.formatOnSave": true,
"[javascript]": {
"editor.defaultFormatter": "esbenp.prettier-vscode",
"editor.formatOnSave": true
},
"editor.defaultFormatter": "esbenp.prettier-vscode",
"editor.formatOnSave": true,
"css.customData": [".vscode/tailwind.json"],
"[typescript]": {
"editor.defaultFormatter": "esbenp.prettier-vscode",
"editor.formatOnSave": true
"editor.defaultFormatter": "esbenp.prettier-vscode"
},
"[vue]": {
"editor.defaultFormatter": "esbenp.prettier-vscode"
},
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter",
"editor.formatOnSave": true
"editor.defaultFormatter": "ms-python.black-formatter"
}
}
55 changes: 55 additions & 0 deletions .vscode/tailwind.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
{
"version": 1.1,
"atDirectives": [
{
"name": "@tailwind",
"description": "Use the `@tailwind` directive to insert Tailwind's `base`, `components`, `utilities` and `screens` styles into your CSS.",
"references": [
{
"name": "Tailwind Documentation",
"url": "https://tailwindcss.com/docs/functions-and-directives#tailwind"
}
]
},
{
"name": "@apply",
"description": "Use the `@apply` directive to inline any existing utility classes into your own custom CSS. This is useful when you find a common utility pattern in your HTML that you’d like to extract to a new component.",
"references": [
{
"name": "Tailwind Documentation",
"url": "https://tailwindcss.com/docs/functions-and-directives#apply"
}
]
},
{
"name": "@responsive",
"description": "You can generate responsive variants of your own classes by wrapping their definitions in the `@responsive` directive:\n```css\n@responsive {\n .alert {\n background-color: #E53E3E;\n }\n}\n```\n",
"references": [
{
"name": "Tailwind Documentation",
"url": "https://tailwindcss.com/docs/functions-and-directives#responsive"
}
]
},
{
"name": "@screen",
"description": "The `@screen` directive allows you to create media queries that reference your breakpoints by **name** instead of duplicating their values in your own CSS:\n```css\n@screen sm {\n /* ... */\n}\n```\n…gets transformed into this:\n```css\n@media (min-width: 640px) {\n /* ... */\n}\n```\n",
"references": [
{
"name": "Tailwind Documentation",
"url": "https://tailwindcss.com/docs/functions-and-directives#screen"
}
]
},
{
"name": "@variants",
"description": "Generate `hover`, `focus`, `active` and other **variants** of your own utilities by wrapping their definitions in the `@variants` directive:\n```css\n@variants hover, focus {\n .btn-brand {\n background-color: #3182CE;\n }\n}\n```\n",
"references": [
{
"name": "Tailwind Documentation",
"url": "https://tailwindcss.com/docs/functions-and-directives#variants"
}
]
}
]
}
4 changes: 4 additions & 0 deletions backend/.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
SUPABASE_URL=http://127.0.0.1:54321
SUPABASE_KEY=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZS1kZW1vIiwicm9sZSI6InNlcnZpY2Vfcm9sZSIsImV4cCI6MTk4MzgxMjk5Nn0.EGIM96RAZx35lJzdJsyH-qQwv8Hdp7fsn3W0YpN81IU
OPENAI_API_KEY=sk...
TOKENIZERS_PARALLELISM=false
3 changes: 3 additions & 0 deletions backend/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
.env
.venv
__pycache__
8 changes: 8 additions & 0 deletions backend/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# None of these targets produce files; declare them phony so a file or
# directory named e.g. `dev` can never cause make to skip the recipe.
.PHONY: dev supabase-start supabase-reset

# Run the API with auto-reload.
dev:
	pipenv run uvicorn src.main:app --reload

# Start the local Supabase stack, excluding the listed services (-x).
supabase-start:
	supabase start -x inbucket,imgproxy,edge-runtime,realtime

# Reset the local Supabase database.
supabase-reset:
	supabase db reset
27 changes: 27 additions & 0 deletions backend/Pipfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"

[packages]
fastapi = "*"
uvicorn = {extras = ["standard"], version = "*"}
supabase = "*"
python-dotenv = "*"
trafilatura = "*"
lxml = "*"
langchain = "*"
sentence-transformers = "*"
requests = "*"
llama-index = "*"
openai = "*"
numpy = "*"
pandas = "*"
llama-index-readers-file = "*"
httpx = "*"
pymupdf = "*"

[dev-packages]

[requires]
python_version = "3.11"
2,642 changes: 2,642 additions & 0 deletions backend/Pipfile.lock

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions backend/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Personal Search Engine (PSE) - Backend

## NOTE: I added some dependencies for a flight; remove any that turn out to be unused.
49 changes: 49 additions & 0 deletions backend/src/db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from __future__ import annotations

import os
from typing import TYPE_CHECKING

from supabase import create_client

if TYPE_CHECKING:
from backend.src.domain.node import TextNode
from src.domain import URL


class DB:
    """Persistence gateway that talks to Supabase through a single client."""

    def __init__(self) -> None:
        # Connection settings are read from the process environment.
        supabase_url = os.getenv("SUPABASE_URL")
        supabase_key = os.getenv("SUPABASE_KEY")
        self._client = create_client(supabase_url, supabase_key)

    def create_urls(self, urls: list[URL], user_id: str):
        """Insert ``urls`` into ``urls_feed``, tagging every row with ``user_id``.

        Returns whatever ``execute()`` returns. If the insert raises, the
        exception is printed and ``None`` is returned implicitly.
        """
        rows = []
        for url in urls:
            row = dict(url.to_persistence())
            row["user_id"] = user_id
            rows.append(row)

        try:
            return self._client.table("urls_feed").insert(rows).execute()
        except Exception as ex:
            print(ex)

    def update_urls(self, urls: list[URL]):
        """Write the current status of each URL back to ``urls_feed``.

        One update is issued per URL; a failure is printed and the loop
        continues with the next URL.
        """
        for url in urls:
            try:
                query = self._client.table("urls_feed").update({"status": url.status})
                query = query.eq("id", url.id)
                query.execute()
            except Exception as ex:
                print(f"Failed to update URL with id {url.id}: {ex}")

    def create_text_nodes(self, nodes: list[TextNode]):
        """Insert the nodes into ``text_nodes`` and their chunks into ``text_node_chunks``.

        Node rows are inserted first, then all chunk rows in a second insert.
        Exceptions from either insert propagate to the caller.
        """
        persisted = [node.to_persistence() for node in nodes]
        node_rows = [node_row for node_row, _ in persisted]
        chunk_rows = [
            chunk_row for _, node_chunks in persisted for chunk_row in node_chunks
        ]

        self._client.table("text_nodes").insert(node_rows).execute()
        self._client.table("text_node_chunks").insert(chunk_rows).execute()
7 changes: 7 additions & 0 deletions backend/src/domain/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from src.domain.url import URL
from src.domain.url import URLStatus
from src.domain.node import TextNode
from src.domain.node import TextNodeChunk


__all__ = ["URL", "URLStatus", "TextNode", "TextNodeChunk"]
61 changes: 61 additions & 0 deletions backend/src/domain/node.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
from __future__ import annotations
from typing import TYPE_CHECKING

from src.utils.uuid import uuid7

if TYPE_CHECKING:
from src.utils.chunker import NodeChunker
from src.utils.embedder import NodeEmbedder


class TextNodeChunk:
    """A piece of a text node's text, with an embedding slot that starts empty."""

    def __init__(self, text: str, text_node_id: str) -> None:
        # Every chunk gets its own freshly generated identifier.
        self.id = uuid7()
        self.text = text
        self.text_node_id = text_node_id
        # Starts as None; assigned later when embeddings are created.
        self.embedding = None


class TextNode:
    """A document fetched from a URL, plus its chunks and embeddings."""

    def __init__(self, url_feed_id: str, url: str, title: str, text: str) -> None:
        self.id = uuid7()
        self.url_feed_id = url_feed_id
        self.url = url
        self.title = title
        self.text = text
        # Both are filled in later by create_embeddings / create_chunks.
        self.embedding = None
        self.chunks: list[TextNodeChunk] = []

    def create_chunks(self, chunker: NodeChunker) -> None:
        """Replace ``self.chunks`` with the result of ``chunker.chunk(id, text)``."""
        self.chunks = chunker.chunk(self.id, self.text)

    def create_embeddings(self, embedder: NodeEmbedder) -> None:
        """Embed the node and all of its chunks with one ``embedder.embed`` call.

        The first text sent is ``url + title + text`` (plain concatenation);
        the chunk texts follow in order. Each returned vector is stored as a
        list via ``.tolist()``.
        """
        node_text = self.url + self.title + self.text
        vectors = embedder.embed([node_text, *(piece.text for piece in self.chunks)])

        self.embedding = vectors[0].tolist()
        for piece, vector in zip(self.chunks, vectors[1:]):
            piece.embedding = vector.tolist()

    def to_persistence(self):
        """Return ``(node_row, chunk_rows)`` as plain dicts ready for insertion."""
        chunk_rows = [
            {
                "id": piece.id,
                "text": piece.text,
                "embedding": piece.embedding,
                "text_node_id": self.id,
            }
            for piece in self.chunks
        ]
        node_row = {
            "id": self.id,
            "url_feed_id": self.url_feed_id,
            "url": self.url,
            "title": self.title,
            "text": self.text,
            "embedding": self.embedding,
        }
        return node_row, chunk_rows
59 changes: 59 additions & 0 deletions backend/src/domain/url.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from enum import Enum
from typing import TypedDict

from src.utils.uuid import uuid7


class URLStatus(Enum):
    """Statuses a URL can hold; the string value is what gets exposed and persisted."""

    # Initial status assigned when a URL object is constructed.
    RECEIVED_AWAITING_INDEXING = "RECEIVED_AWAITING_INDEXING"
    # NOTE(review): "SKIPED" is misspelled, but the value is written to
    # persistence, so renaming presumably needs a data migration - confirm first.
    INDEXING_SKIPED_AS_RECENT_DUPLICATE = "INDEXING_SKIPED_AS_RECENT_DUPLICATE"
    # Set by URL.set_indexing_success().
    INDEXED_SUCCESSFULLY = "INDEXED_SUCCESSFULLY"
    # Set by URL.set_indexing_failure().
    INDEXING_FAILED = "INDEXING_FAILED"


class URLPersistence(TypedDict):
    """Shape of the row dict produced by ``URL.to_persistence``."""

    # Generated by uuid7() and exposed by URL.id as a str, so this is a
    # string identifier rather than an integer.
    id: str
    # The URL as stored on the object.
    url: str
    # The URL exactly as it was received.
    raw_url: str
    # The string value of a URLStatus member.
    status: str


class URL:
    """A submitted URL together with its indexing status."""

    def __init__(self, url: str) -> None:
        self._id = uuid7()
        self._url = url
        self._raw_url = url
        # TODO: this needs more thought as currently this cleaning can cause failures
        # self.url = self.clean_url(url)
        self._status: URLStatus = URLStatus.RECEIVED_AWAITING_INDEXING

    @property
    def id(self) -> str:
        """Identifier generated when the object was created."""
        return self._id

    @property
    def url(self) -> str:
        """The URL as stored on this object."""
        return self._url

    @property
    def status(self) -> str:
        """The current status as its string value."""
        return self._status.value

    def clean_url(self, url: str) -> str:
        """Return ``url`` without trailing slashes and with an ``https`` scheme.

        Only a leading ``http://`` is upgraded. Occurrences of ``http://``
        later in the string (for example inside a query parameter) are left
        untouched, because rewriting them would change the meaning of the URL.
        """
        url = url.rstrip("/")
        if url.startswith("http://"):
            url = "https://" + url.removeprefix("http://")
        return url

    def set_indexing_success(self):
        """Mark the URL as successfully indexed."""
        self._status = URLStatus.INDEXED_SUCCESSFULLY

    def set_indexing_failure(self):
        """Mark the URL as having failed indexing."""
        self._status = URLStatus.INDEXING_FAILED

    def to_persistence(self) -> URLPersistence:
        """Return the row dict for this URL, with the status as its string value."""
        return {
            "id": self._id,
            "url": self._url,
            "raw_url": self._raw_url,
            "status": self._status.value,
        }
57 changes: 57 additions & 0 deletions backend/src/llm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import json
from collections.abc import Iterator
from typing import List

from openai import OpenAI

# Shared OpenAI client, created once when this module is imported.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment - confirm.
client = OpenAI()

# Chat model passed to every completion request made by this module.
MODEL = "gpt-3.5-turbo-0613"

# Prompt sent as the single user message. `{context}` is filled with one JSON
# line per context document and `{question}` with the user's question.
# Adjacent string literals are concatenated verbatim, so every instruction
# must end with its own separator; otherwise sentences run together.
PROMPT_TEMPLATE = (
    "A question and context documents are provided below.\n"
    "If the required information is not available in the context documents to answer the question, "
    "explain to the user that you are missing the required information.\n"
    "If you are able to answer make sure you quote the source.\n"
    "You will be provided a JSON line for each context document, which contains <text>, <title> & <url> keys.\n"
    "At the END of your reply (not inline), you should list all unique sources using the <title> & <url> keys.\n"
    "Answer using Markdown.\n"
    "---------------------\n"
    "CONTEXT:\n"
    "{context}"
    "---------------------\n"
    "QUESTION:\n"
    "{question}"
)


def format_chunks(chunks: List[dict]) -> str:
    """Serialise context chunks as JSON lines for the prompt.

    Each chunk becomes one ``json.dumps`` line terminated by a newline, with
    the ``id`` and ``score`` keys left out. The input dicts are not modified,
    and chunks that lack ``id`` or ``score`` are accepted.

    Args:
        chunks: Context documents as dicts.

    Returns:
        The concatenated JSON lines, or an empty string for no chunks.
    """
    excluded_keys = ("id", "score")
    return "".join(
        json.dumps(
            {key: value for key, value in chunk.items() if key not in excluded_keys}
        )
        + "\n"
        for chunk in chunks
    )


def answer_with_context(chunks: List[dict], question: str) -> Iterator[str]:
    """Stream an answer to ``question`` based on the given context chunks.

    The chunks are formatted into the prompt template and sent as a single
    user message with streaming enabled. This is a generator: it yields one
    string per stream event, which is the event's text delta or ``""`` when
    the event carries no text.

    Args:
        chunks: Context documents as dicts.
        question: The user's question.

    Yields:
        Successive pieces of the answer text.
    """
    prompt = PROMPT_TEMPLATE.format(
        context=format_chunks(chunks), question=question
    )
    stream = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=MODEL,
        stream=True,
    )
    for event in stream:
        # Guard against events with an empty `choices` list so indexing
        # cannot raise mid-stream; such events are treated like an empty delta.
        if not event.choices:
            yield ""
            continue
        yield event.choices[0].delta.content or ""
Loading

0 comments on commit 01526d2

Please sign in to comment.