forked from open-webui/open-webui
Merge pull request #547 from Marclass/main
feat: Add reStructuredText specific parser for RAG
This commit is contained in:
commit
d517a3ebb4
4 changed files with 26 additions and 13 deletions
|
@ -22,6 +22,7 @@ from langchain_community.document_loaders import (
|
||||||
UnstructuredWordDocumentLoader,
|
UnstructuredWordDocumentLoader,
|
||||||
UnstructuredMarkdownLoader,
|
UnstructuredMarkdownLoader,
|
||||||
UnstructuredXMLLoader,
|
UnstructuredXMLLoader,
|
||||||
|
UnstructuredRSTLoader,
|
||||||
)
|
)
|
||||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||||
from langchain_community.vectorstores import Chroma
|
from langchain_community.vectorstores import Chroma
|
||||||
|
@ -178,6 +179,8 @@ def store_doc(
|
||||||
loader = Docx2txtLoader(file_path)
|
loader = Docx2txtLoader(file_path)
|
||||||
elif file_ext=="csv":
|
elif file_ext=="csv":
|
||||||
loader = CSVLoader(file_path)
|
loader = CSVLoader(file_path)
|
||||||
|
elif file_ext=="rst":
|
||||||
|
loader = UnstructuredRSTLoader(file_path, mode="elements")
|
||||||
elif file_ext in text_xml:
|
elif file_ext in text_xml:
|
||||||
loader=UnstructuredXMLLoader(file_path)
|
loader=UnstructuredXMLLoader(file_path)
|
||||||
elif file_ext in known_source_ext or file.content_type.find("text/")>=0:
|
elif file_ext in known_source_ext or file.content_type.find("text/")>=0:
|
||||||
|
|
|
@ -8,7 +8,7 @@
|
||||||
import Suggestions from './MessageInput/Suggestions.svelte';
|
import Suggestions from './MessageInput/Suggestions.svelte';
|
||||||
import { uploadDocToVectorDB } from '$lib/apis/rag';
|
import { uploadDocToVectorDB } from '$lib/apis/rag';
|
||||||
import AddFilesPlaceholder from '../AddFilesPlaceholder.svelte';
|
import AddFilesPlaceholder from '../AddFilesPlaceholder.svelte';
|
||||||
import { SUPPORTED_FILE_TYPE } from '$lib/constants';
|
import { SUPPORTED_FILE_TYPE, SUPPORTED_FILE_EXTENSIONS } from '$lib/constants';
|
||||||
import Documents from './MessageInput/Documents.svelte';
|
import Documents from './MessageInput/Documents.svelte';
|
||||||
import Models from './MessageInput/Models.svelte';
|
import Models from './MessageInput/Models.svelte';
|
||||||
|
|
||||||
|
@ -169,11 +169,13 @@
|
||||||
reader.readAsDataURL(file);
|
reader.readAsDataURL(file);
|
||||||
} else if (
|
} else if (
|
||||||
SUPPORTED_FILE_TYPE.includes(file['type']) ||
|
SUPPORTED_FILE_TYPE.includes(file['type']) ||
|
||||||
['md'].includes(file.name.split('.').at(-1))
|
SUPPORTED_FILE_EXTENSIONS.includes(file.name.split('.').at(-1))
|
||||||
) {
|
) {
|
||||||
uploadDoc(file);
|
uploadDoc(file);
|
||||||
} else {
|
} else {
|
||||||
toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
|
toast.error(
|
||||||
|
`Unknown File Type '${file['type']}', but accepting and treating as plain text`
|
||||||
|
);
|
||||||
uploadDoc(file);
|
uploadDoc(file);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -304,12 +306,14 @@
|
||||||
reader.readAsDataURL(file);
|
reader.readAsDataURL(file);
|
||||||
} else if (
|
} else if (
|
||||||
SUPPORTED_FILE_TYPE.includes(file['type']) ||
|
SUPPORTED_FILE_TYPE.includes(file['type']) ||
|
||||||
['md'].includes(file.name.split('.').at(-1))
|
SUPPORTED_FILE_EXTENSIONS.includes(file.name.split('.').at(-1))
|
||||||
) {
|
) {
|
||||||
uploadDoc(file);
|
uploadDoc(file);
|
||||||
filesInputElement.value = '';
|
filesInputElement.value = '';
|
||||||
} else {
|
} else {
|
||||||
toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
|
toast.error(
|
||||||
|
`Unknown File Type '${file['type']}', but accepting and treating as plain text`
|
||||||
|
);
|
||||||
uploadDoc(file);
|
uploadDoc(file);
|
||||||
filesInputElement.value = '';
|
filesInputElement.value = '';
|
||||||
}
|
}
|
||||||
|
@ -466,8 +470,8 @@
|
||||||
placeholder={chatInputPlaceholder !== ''
|
placeholder={chatInputPlaceholder !== ''
|
||||||
? chatInputPlaceholder
|
? chatInputPlaceholder
|
||||||
: speechRecognitionListening
|
: speechRecognitionListening
|
||||||
? 'Listening...'
|
? 'Listening...'
|
||||||
: 'Send a message'}
|
: 'Send a message'}
|
||||||
bind:value={prompt}
|
bind:value={prompt}
|
||||||
on:keypress={(e) => {
|
on:keypress={(e) => {
|
||||||
if (e.keyCode == 13 && !e.shiftKey) {
|
if (e.keyCode == 13 && !e.shiftKey) {
|
||||||
|
|
|
@ -21,9 +21,11 @@ export const SUPPORTED_FILE_TYPE = [
|
||||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
||||||
'application/octet-stream',
|
'application/octet-stream',
|
||||||
'application/x-javascript',
|
'application/x-javascript',
|
||||||
'text/markdown',
|
'text/markdown'
|
||||||
];
|
];
|
||||||
|
|
||||||
|
export const SUPPORTED_FILE_EXTENSIONS = ['md', 'rst'];
|
||||||
|
|
||||||
// Source: https://kit.svelte.dev/docs/modules#$env-static-public
|
// Source: https://kit.svelte.dev/docs/modules#$env-static-public
|
||||||
// This feature, akin to $env/static/private, exclusively incorporates environment variables
|
// This feature, akin to $env/static/private, exclusively incorporates environment variables
|
||||||
// that are prefixed with config.kit.env.publicPrefix (usually set to PUBLIC_).
|
// that are prefixed with config.kit.env.publicPrefix (usually set to PUBLIC_).
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
import { documents } from '$lib/stores';
|
import { documents } from '$lib/stores';
|
||||||
import { createNewDoc, deleteDocByName, getDocs } from '$lib/apis/documents';
|
import { createNewDoc, deleteDocByName, getDocs } from '$lib/apis/documents';
|
||||||
|
|
||||||
import { SUPPORTED_FILE_TYPE } from '$lib/constants';
|
import { SUPPORTED_FILE_TYPE, SUPPORTED_FILE_EXTENSIONS } from '$lib/constants';
|
||||||
import { uploadDocToVectorDB } from '$lib/apis/rag';
|
import { uploadDocToVectorDB } from '$lib/apis/rag';
|
||||||
import { transformFileName } from '$lib/utils';
|
import { transformFileName } from '$lib/utils';
|
||||||
|
|
||||||
|
@ -69,11 +69,13 @@
|
||||||
const file = inputFiles[0];
|
const file = inputFiles[0];
|
||||||
if (
|
if (
|
||||||
SUPPORTED_FILE_TYPE.includes(file['type']) ||
|
SUPPORTED_FILE_TYPE.includes(file['type']) ||
|
||||||
['md'].includes(file.name.split('.').at(-1))
|
SUPPORTED_FILE_EXTENSIONS.includes(file.name.split('.').at(-1))
|
||||||
) {
|
) {
|
||||||
uploadDoc(file);
|
uploadDoc(file);
|
||||||
} else {
|
} else {
|
||||||
toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
|
toast.error(
|
||||||
|
`Unknown File Type '${file['type']}', but accepting and treating as plain text`
|
||||||
|
);
|
||||||
uploadDoc(file);
|
uploadDoc(file);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -150,11 +152,13 @@
|
||||||
const file = inputFiles[0];
|
const file = inputFiles[0];
|
||||||
if (
|
if (
|
||||||
SUPPORTED_FILE_TYPE.includes(file['type']) ||
|
SUPPORTED_FILE_TYPE.includes(file['type']) ||
|
||||||
['md'].includes(file.name.split('.').at(-1))
|
SUPPORTED_FILE_EXTENSIONS.includes(file.name.split('.').at(-1))
|
||||||
) {
|
) {
|
||||||
uploadDoc(file);
|
uploadDoc(file);
|
||||||
} else {
|
} else {
|
||||||
toast.error(`Unknown File Type '${file['type']}', but accepting and treating as plain text`);
|
toast.error(
|
||||||
|
`Unknown File Type '${file['type']}', but accepting and treating as plain text`
|
||||||
|
);
|
||||||
uploadDoc(file);
|
uploadDoc(file);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue