mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-10-11 05:36:03 +02:00
Implement indexing of simple tables in Word files (#1651)
This commit is contained in:
@@ -240,9 +240,43 @@ def read_pdf_file(
|
||||
|
||||
|
||||
def docx_to_text(file: IO[Any]) -> str:
    """Extract the text of a Word document, including simple tables.

    The document body is walked in order with ``iter_inner_content``.
    Each paragraph contributes its text. Each table that is non-empty and
    "simple" (see ``is_simple_table``) contributes one line per row, with
    the cell texts of a row joined by ``",\\t"``. Other tables are skipped.

    Args:
        file: Binary file-like object handed to ``docx.Document``.

    Returns:
        All collected paragraph and table strings joined by a single newline.
    """

    def is_simple_table(table: docx.table.Table) -> bool:
        """Return True when no row has omitted cells or nested tables."""
        for row in table.rows:
            # No omitted cells before or after the row's real cells.
            if row.grid_cols_before > 0 or row.grid_cols_after > 0:
                return False

            # No nested tables inside any cell of the row.
            if any(cell.tables for cell in row.cells):
                return False

        return True

    def extract_cell_text(cell: docx.table._Cell) -> str:
        """Join the cell's non-empty stripped paragraphs, or return "N/A"."""
        cell_paragraphs = [para.text.strip() for para in cell.paragraphs]
        # The placeholder keeps an empty cell visible in the joined row.
        return " ".join(p for p in cell_paragraphs if p) or "N/A"

    doc = docx.Document(file)
    paragraphs: list[str] = []

    for item in doc.iter_inner_content():
        if isinstance(item, docx.text.paragraph.Paragraph):
            paragraphs.append(item.text)

        elif isinstance(item, docx.table.Table):
            # Skip empty tables and tables that fail the simplicity check.
            if not item.rows or not is_simple_table(item):
                continue

            # Every row is a new line, joined with a single newline
            table_content = "\n".join(
                ",\t".join(extract_cell_text(cell) for cell in row.cells)
                for row in item.rows
            )
            paragraphs.append(table_content)

    # Docx already has good spacing between paragraphs
    return "\n".join(paragraphs)
|
||||
|
||||
|
||||
def pptx_to_text(file: IO[Any]) -> str:
|
||||
|
@@ -50,7 +50,7 @@ python-pptx==0.6.23
|
||||
pypdf==3.17.0
|
||||
pytest-mock==3.12.0
|
||||
pytest-playwright==0.3.2
|
||||
python-docx==1.1.0
|
||||
python-docx==1.1.2
|
||||
python-dotenv==1.0.0
|
||||
python-multipart==0.0.7
|
||||
pywikibot==9.0.0
|
||||
|
Reference in New Issue
Block a user