Merge pull request #547 from Marclass/main

feat: Add reStructuredText specific parser for RAG
This commit is contained in:
Timothy Jaeryang Baek 2024-01-22 22:19:41 -08:00 committed by GitHub
commit d517a3ebb4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 26 additions and 13 deletions

View file

@@ -22,6 +22,7 @@ from langchain_community.document_loaders import (
UnstructuredWordDocumentLoader, UnstructuredWordDocumentLoader,
UnstructuredMarkdownLoader, UnstructuredMarkdownLoader,
UnstructuredXMLLoader, UnstructuredXMLLoader,
UnstructuredRSTLoader,
) )
from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma from langchain_community.vectorstores import Chroma
@@ -178,6 +179,8 @@ def store_doc(
loader = Docx2txtLoader(file_path) loader = Docx2txtLoader(file_path)
elif file_ext=="csv": elif file_ext=="csv":
loader = CSVLoader(file_path) loader = CSVLoader(file_path)
elif file_ext=="rst":
loader = UnstructuredRSTLoader(file_path, mode="elements")
elif file_ext in text_xml: elif file_ext in text_xml:
loader=UnstructuredXMLLoader(file_path) loader=UnstructuredXMLLoader(file_path)
elif file_ext in known_source_ext or file.content_type.find("text/")>=0: elif file_ext in known_source_ext or file.content_type.find("text/")>=0:

View file

@@ -8,7 +8,7 @@
import Suggestions from './MessageInput/Suggestions.svelte'; import Suggestions from './MessageInput/Suggestions.svelte';
import { uploadDocToVectorDB } from '$lib/apis/rag'; import { uploadDocToVectorDB } from '$lib/apis/rag';
import AddFilesPlaceholder from '../AddFilesPlaceholder.svelte'; import AddFilesPlaceholder from '../AddFilesPlaceholder.svelte';
import { SUPPORTED_FILE_TYPE } from '$lib/constants'; import { SUPPORTED_FILE_TYPE, SUPPORTED_FILE_EXTENSIONS } from '$lib/constants';
import Documents from './MessageInput/Documents.svelte'; import Documents from './MessageInput/Documents.svelte';
import Models from './MessageInput/Models.svelte'; import Models from './MessageInput/Models.svelte';
@@ -169,11 +169,13 @@
reader.readAsDataURL(file); reader.readAsDataURL(file);
} else if ( } else if (
SUPPORTED_FILE_TYPE.includes(file['type']) || SUPPORTED_FILE_TYPE.includes(file['type']) ||
['md'].includes(file.name.split('.').at(-1)) SUPPORTED_FILE_EXTENSIONS.includes(file.name.split('.').at(-1))
) { ) {
uploadDoc(file); uploadDoc(file);
} else { } else {
toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`); toast.error(
`Unknown File Type '${file['type']}', but accepting and treating as plain text`
);
uploadDoc(file); uploadDoc(file);
} }
} else { } else {
@@ -304,12 +306,14 @@
reader.readAsDataURL(file); reader.readAsDataURL(file);
} else if ( } else if (
SUPPORTED_FILE_TYPE.includes(file['type']) || SUPPORTED_FILE_TYPE.includes(file['type']) ||
['md'].includes(file.name.split('.').at(-1)) SUPPORTED_FILE_EXTENSIONS.includes(file.name.split('.').at(-1))
) { ) {
uploadDoc(file); uploadDoc(file);
filesInputElement.value = ''; filesInputElement.value = '';
} else { } else {
toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`); toast.error(
`Unknown File Type '${file['type']}', but accepting and treating as plain text`
);
uploadDoc(file); uploadDoc(file);
filesInputElement.value = ''; filesInputElement.value = '';
} }

View file

@@ -21,9 +21,11 @@ export const SUPPORTED_FILE_TYPE = [
'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'application/octet-stream', 'application/octet-stream',
'application/x-javascript', 'application/x-javascript',
'text/markdown', 'text/markdown'
]; ];
export const SUPPORTED_FILE_EXTENSIONS = ['md', 'rst'];
// Source: https://kit.svelte.dev/docs/modules#$env-static-public // Source: https://kit.svelte.dev/docs/modules#$env-static-public
// This feature, akin to $env/static/private, exclusively incorporates environment variables // This feature, akin to $env/static/private, exclusively incorporates environment variables
// that are prefixed with config.kit.env.publicPrefix (usually set to PUBLIC_). // that are prefixed with config.kit.env.publicPrefix (usually set to PUBLIC_).

View file

@@ -7,7 +7,7 @@
import { documents } from '$lib/stores'; import { documents } from '$lib/stores';
import { createNewDoc, deleteDocByName, getDocs } from '$lib/apis/documents'; import { createNewDoc, deleteDocByName, getDocs } from '$lib/apis/documents';
import { SUPPORTED_FILE_TYPE } from '$lib/constants'; import { SUPPORTED_FILE_TYPE, SUPPORTED_FILE_EXTENSIONS } from '$lib/constants';
import { uploadDocToVectorDB } from '$lib/apis/rag'; import { uploadDocToVectorDB } from '$lib/apis/rag';
import { transformFileName } from '$lib/utils'; import { transformFileName } from '$lib/utils';
@@ -69,11 +69,13 @@
const file = inputFiles[0]; const file = inputFiles[0];
if ( if (
SUPPORTED_FILE_TYPE.includes(file['type']) || SUPPORTED_FILE_TYPE.includes(file['type']) ||
['md'].includes(file.name.split('.').at(-1)) SUPPORTED_FILE_EXTENSIONS.includes(file.name.split('.').at(-1))
) { ) {
uploadDoc(file); uploadDoc(file);
} else { } else {
toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`); toast.error(
`Unknown File Type '${file['type']}', but accepting and treating as plain text`
);
uploadDoc(file); uploadDoc(file);
} }
} else { } else {
@@ -150,11 +152,13 @@
const file = inputFiles[0]; const file = inputFiles[0];
if ( if (
SUPPORTED_FILE_TYPE.includes(file['type']) || SUPPORTED_FILE_TYPE.includes(file['type']) ||
['md'].includes(file.name.split('.').at(-1)) SUPPORTED_FILE_EXTENSIONS.includes(file.name.split('.').at(-1))
) { ) {
uploadDoc(file); uploadDoc(file);
} else { } else {
toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`); toast.error(
`Unknown File Type '${file['type']}', but accepting and treating as plain text`
);
uploadDoc(file); uploadDoc(file);
} }