forked from open-webui/open-webui
Merge pull request #554 from Marclass/main
feat: Add excel parser for RAG
This commit is contained in:
commit
d29321f1ec
3 changed files with 89 additions and 38 deletions
|
@ -24,6 +24,7 @@ from langchain_community.document_loaders import (
|
||||||
UnstructuredMarkdownLoader,
|
UnstructuredMarkdownLoader,
|
||||||
UnstructuredXMLLoader,
|
UnstructuredXMLLoader,
|
||||||
UnstructuredRSTLoader,
|
UnstructuredRSTLoader,
|
||||||
|
UnstructuredExcelLoader,
|
||||||
)
|
)
|
||||||
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
||||||
from langchain_community.vectorstores import Chroma
|
from langchain_community.vectorstores import Chroma
|
||||||
|
@ -137,6 +138,87 @@ def store_web(form_data: StoreWebForm, user=Depends(get_current_user)):
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def get_loader(file, file_path):
    """Pick a document loader for an uploaded file.

    The loader is selected from the file's extension and, for some formats,
    its reported MIME type. Checks run in a fixed order and the first match
    wins: PDF, CSV, reStructuredText, XML, Markdown, EPUB, Word, Excel, then
    known source-code extensions or any ``text/*`` MIME type.

    Args:
        file: Upload object exposing ``filename`` and ``content_type``
            (presumably a FastAPI ``UploadFile`` -- confirm against caller).
            Either attribute may be ``None``; both are treated as empty
            strings in that case.
        file_path: Path of the stored copy of the upload, handed to the
            loader constructor.

    Returns:
        A ``(loader, known_type)`` tuple. ``known_type`` is ``False`` only
        when no rule matched and ``TextLoader`` was used as a fallback.
    """
    # A missing filename or MIME type must not crash the selection logic;
    # an empty string simply matches none of the specific rules below.
    filename = file.filename or ""
    content_type = file.content_type or ""

    # Text after the last dot. A name without any dot yields the whole
    # lower-cased name, which is how "Dockerfile" matches "dockerfile".
    file_ext = filename.split(".")[-1].lower()
    known_type = True

    # Extensions loaded as plain text; a set because it is only used for
    # membership tests.
    known_source_ext = {
        "go",
        "py",
        "java",
        "sh",
        "bat",
        "ps1",
        "cmd",
        "js",
        "ts",
        "css",
        "cpp",
        "hpp",
        "h",
        "c",
        "cs",
        "sql",
        "log",
        "ini",
        "pl",
        "pm",
        "r",
        "dart",
        "dockerfile",
        "env",
        "php",
        "hs",
        "hsc",
        "lua",
        "nginxconf",
        "conf",
        "m",
        "mm",
        "plsql",
        "perl",
        "rb",
        "rs",
        "db2",
        "scala",
        "bash",
        "swift",
        "vue",
        "svelte",
    }

    if file_ext == "pdf":
        loader = PyPDFLoader(file_path)
    elif file_ext == "csv":
        loader = CSVLoader(file_path)
    elif file_ext == "rst":
        loader = UnstructuredRSTLoader(file_path, mode="elements")
    elif file_ext == "xml":
        loader = UnstructuredXMLLoader(file_path)
    elif file_ext == "md":
        loader = UnstructuredMarkdownLoader(file_path)
    elif content_type == "application/epub+zip":
        loader = UnstructuredEPubLoader(file_path)
    elif (
        content_type
        == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        or file_ext in ("doc", "docx")
    ):
        loader = Docx2txtLoader(file_path)
    elif content_type in (
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    ) or file_ext in ("xls", "xlsx"):
        loader = UnstructuredExcelLoader(file_path)
    elif file_ext in known_source_ext or "text/" in content_type:
        loader = TextLoader(file_path)
    else:
        # Unrecognised type: still attempt a plain-text load, but report it
        # so the caller can tell the file type was not recognised.
        loader = TextLoader(file_path)
        known_type = False

    return loader, known_type
|
||||||
|
|
||||||
|
|
||||||
@app.post("/doc")
|
@app.post("/doc")
|
||||||
def store_doc(
|
def store_doc(
|
||||||
collection_name: Optional[str] = Form(None),
|
collection_name: Optional[str] = Form(None),
|
||||||
|
@ -146,21 +228,6 @@ def store_doc(
|
||||||
# "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm"
|
# "https://www.gutenberg.org/files/1727/1727-h/1727-h.htm"
|
||||||
|
|
||||||
print(file.content_type)
|
print(file.content_type)
|
||||||
|
|
||||||
text_xml=["xml"]
|
|
||||||
octet_markdown=["md"]
|
|
||||||
known_source_ext=[
|
|
||||||
"go", "py", "java", "sh", "bat", "ps1", "cmd", "js", "ts",
|
|
||||||
"css", "cpp", "hpp","h", "c", "cs", "sql", "log", "ini",
|
|
||||||
"pl", "pm", "r", "dart", "dockerfile", "env", "php", "hs",
|
|
||||||
"hsc", "lua", "nginxconf", "conf", "m", "mm", "plsql", "perl",
|
|
||||||
"rb", "rs", "db2", "scala", "bash", "swift", "vue", "svelte"
|
|
||||||
]
|
|
||||||
docx_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
|
||||||
known_doc_ext=["doc","docx"]
|
|
||||||
file_ext=file.filename.split(".")[-1].lower()
|
|
||||||
known_type=True
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
filename = file.filename
|
filename = file.filename
|
||||||
file_path = f"{UPLOAD_DIR}/{filename}"
|
file_path = f"{UPLOAD_DIR}/{filename}"
|
||||||
|
@ -174,27 +241,7 @@ def store_doc(
|
||||||
collection_name = calculate_sha256(f)[:63]
|
collection_name = calculate_sha256(f)[:63]
|
||||||
f.close()
|
f.close()
|
||||||
|
|
||||||
if file_ext=="pdf":
|
loader, known_type = get_loader(file, file_path)
|
||||||
loader = PyPDFLoader(file_path)
|
|
||||||
elif (file.content_type ==docx_type or file_ext in known_doc_ext):
|
|
||||||
loader = Docx2txtLoader(file_path)
|
|
||||||
elif file_ext=="csv":
|
|
||||||
loader = CSVLoader(file_path)
|
|
||||||
elif file_ext=="rst":
|
|
||||||
loader = UnstructuredRSTLoader(file_path, mode="elements")
|
|
||||||
elif file_ext in text_xml:
|
|
||||||
loader=UnstructuredXMLLoader(file_path)
|
|
||||||
elif file_ext in known_source_ext or file.content_type.find("text/")>=0:
|
|
||||||
loader = TextLoader(file_path)
|
|
||||||
elif file_ext in octet_markdown:
|
|
||||||
loader = UnstructuredMarkdownLoader(file_path)
|
|
||||||
elif file.content_type == "application/epub+zip":
|
|
||||||
loader = UnstructuredEPubLoader(file_path)
|
|
||||||
else:
|
|
||||||
loader = TextLoader(file_path)
|
|
||||||
known_type=False
|
|
||||||
|
|
||||||
|
|
||||||
data = loader.load()
|
data = loader.load()
|
||||||
result = store_data_in_vector_db(data, collection_name)
|
result = store_data_in_vector_db(data, collection_name)
|
||||||
|
|
||||||
|
|
|
@ -25,6 +25,10 @@ docx2txt
|
||||||
unstructured
|
unstructured
|
||||||
markdown
|
markdown
|
||||||
pypandoc
|
pypandoc
|
||||||
|
pandas
|
||||||
|
openpyxl
|
||||||
|
pyxlsb
|
||||||
|
xlrd
|
||||||
|
|
||||||
PyJWT
|
PyJWT
|
||||||
pyjwt[crypto]
|
pyjwt[crypto]
|
||||||
|
|
|
@ -31,7 +31,7 @@ export const SUPPORTED_FILE_EXTENSIONS = [
|
||||||
'pl', 'pm', 'r', 'dart', 'dockerfile', 'env', 'php', 'hs',
|
'pl', 'pm', 'r', 'dart', 'dockerfile', 'env', 'php', 'hs',
|
||||||
'hsc', 'lua', 'nginxconf', 'conf', 'm', 'mm', 'plsql', 'perl',
|
'hsc', 'lua', 'nginxconf', 'conf', 'm', 'mm', 'plsql', 'perl',
|
||||||
'rb', 'rs', 'db2', 'scala', 'bash', 'swift', 'vue', 'svelte',
|
'rb', 'rs', 'db2', 'scala', 'bash', 'swift', 'vue', 'svelte',
|
||||||
'doc','docx', 'pdf', 'csv', 'txt'
|
'doc','docx', 'pdf', 'csv', 'txt', 'xls', 'xlsx'
|
||||||
];
|
];
|
||||||
|
|
||||||
// Source: https://kit.svelte.dev/docs/modules#$env-static-public
|
// Source: https://kit.svelte.dev/docs/modules#$env-static-public
|
||||||
|
|
Loading…
Reference in a new issue