Implement indexing of simple tables in Word files (#1651)

2025-10-11 05:36:03 +02:00 · 2024-09-08 18:38:46 +02:00
parent 57c1deb8b8
commit 51a13f5fc7
2 changed files with 37 additions and 3 deletions
--- a/backend/danswer/file_processing/extract_file_text.py
+++ b/backend/danswer/file_processing/extract_file_text.py
@@ -240,9 +240,43 @@ def read_pdf_file(


 def docx_to_text(file: IO[Any]) -> str:
+    def is_simple_table(table: docx.table.Table) -> bool:
+        for row in table.rows:
+            # No omitted cells: the row must not skip grid columns at its start or end
+            if row.grid_cols_before > 0 or row.grid_cols_after > 0:
+                return False
+
+            # No nested tables
+            if any(cell.tables for cell in row.cells):
+                return False
+
+        return True
+
+    def extract_cell_text(cell: docx.table._Cell) -> str:
+        cell_paragraphs = [para.text.strip() for para in cell.paragraphs]
+        return " ".join(p for p in cell_paragraphs if p) or "N/A"
+
+    paragraphs = []
    doc = docx.Document(file)
-    full_text = [para.text for para in doc.paragraphs]
-    return TEXT_SECTION_SEPARATOR.join(full_text)
+    for item in doc.iter_inner_content():
+        if isinstance(item, docx.text.paragraph.Paragraph):
+            paragraphs.append(item.text)
+
+        elif isinstance(item, docx.table.Table):
+            if not item.rows or not is_simple_table(item):
+                continue
+
+            # One line per row; cells within a row are separated by ",\t"
+            table_content = "\n".join(
+                [
+                    ",\t".join(extract_cell_text(cell) for cell in row.cells)
+                    for row in item.rows
+                ]
+            )
+            paragraphs.append(table_content)
+
+    # Docx already has good spacing between paragraphs
+    return "\n".join(paragraphs)


 def pptx_to_text(file: IO[Any]) -> str:
--- a/backend/requirements/default.txt
+++ b/backend/requirements/default.txt
@@ -50,7 +50,7 @@ python-pptx==0.6.23
 pypdf==3.17.0
 pytest-mock==3.12.0
 pytest-playwright==0.3.2
-python-docx==1.1.0
+python-docx==1.1.2
 python-dotenv==1.0.0
 python-multipart==0.0.7
 pywikibot==9.0.0