From 1c10f5429400b2b84d06017f1771f4b2b9a0b6cd Mon Sep 17 00:00:00 2001
From: Yuhong Sun
Date: Wed, 14 Aug 2024 11:04:28 -0700
Subject: [PATCH] GPU Model Server (#2135)

---
 backend/model_server/custom_models.py       | 16 ++++++++++++----
 backend/model_server/danswer_torch_model.py | 20 +++++++++++++-------
 2 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/backend/model_server/custom_models.py b/backend/model_server/custom_models.py
index 35a991629..fa0fa60a3 100644
--- a/backend/model_server/custom_models.py
+++ b/backend/model_server/custom_models.py
@@ -66,22 +66,30 @@ def warm_up_intent_model() -> None:
         MODEL_WARM_UP_STRING, return_tensors="pt", truncation=True, padding=True
     )
     intent_model = get_local_intent_model()
-    intent_model(query_ids=tokens["input_ids"], query_mask=tokens["attention_mask"])
+    device = intent_model.device
+    intent_model(
+        query_ids=tokens["input_ids"].to(device),
+        query_mask=tokens["attention_mask"].to(device),
+    )
 
 
 @simple_log_function_time()
 def run_inference(tokens: BatchEncoding) -> tuple[list[float], list[float]]:
     intent_model = get_local_intent_model()
+    device = intent_model.device
     outputs = intent_model(
-        query_ids=tokens["input_ids"], query_mask=tokens["attention_mask"]
+        query_ids=tokens["input_ids"].to(device),
+        query_mask=tokens["attention_mask"].to(device),
     )
 
     token_logits = outputs["token_logits"]
     intent_logits = outputs["intent_logits"]
 
-    intent_probabilities = F.softmax(intent_logits, dim=-1).numpy()[0]
-    token_probabilities = F.softmax(token_logits, dim=-1).numpy()[0]
+    # Move tensors to CPU before applying softmax and converting to numpy
+    intent_probabilities = F.softmax(intent_logits.cpu(), dim=-1).numpy()[0]
+    token_probabilities = F.softmax(token_logits.cpu(), dim=-1).numpy()[0]
 
+    # Extract the probabilities for the positive class (index 1) for each token
     token_positive_probs = token_probabilities[:, 1].tolist()

diff --git a/backend/model_server/danswer_torch_model.py b/backend/model_server/danswer_torch_model.py
index abd28c3e9..28554a4fd 100644
--- a/backend/model_server/danswer_torch_model.py
+++ b/backend/model_server/danswer_torch_model.py
@@ -23,6 +23,8 @@ class HybridClassifier(nn.Module):
         self.intent_classifier = nn.Linear(self.distilbert.config.dim, 2)
         self.dropout = nn.Dropout(self.distilbert.config.seq_classif_dropout)
 
+        self.device = torch.device("cpu")
+
     def forward(
         self,
         query_ids: torch.Tensor,
@@ -51,17 +53,21 @@ class HybridClassifier(nn.Module):
             config = json.load(f)
         model = cls(**config)
 
-        if torch.cuda.is_available():
+        if torch.backends.mps.is_available():
+            # Apple silicon GPU
+            device = torch.device("mps")
+        elif torch.cuda.is_available():
             device = torch.device("cuda")
-            model.load_state_dict(torch.load(model_path, map_location=device))
-            model = model.to(device)
-        else:
-            # No cuda, model most likely just loaded on CPU
-            model.load_state_dict(torch.load(model_path))
+        else:
+            device = torch.device("cpu")
+
+        model.load_state_dict(torch.load(model_path, map_location=device))
+        model = model.to(device)
+
+        model.device = device
 
         model.eval()
-
+        # Eval doesn't set requires_grad to False, do it manually to save memory and have faster inference
         for param in model.parameters():
             param.requires_grad = False
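
Reviewer note: the device-selection branch added to from_pretrained can be exercised on its own. Below is a minimal sketch of the same MPS -> CUDA -> CPU fallback, assuming only a working torch install; the pick_device name is a hypothetical helper, not part of this patch.

import torch

def pick_device() -> torch.device:
    # Prefer Apple silicon (MPS), then CUDA, then fall back to CPU,
    # mirroring the branch order introduced in from_pretrained
    if torch.backends.mps.is_available():
        return torch.device("mps")
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")

print(pick_device())  # e.g. device(type='mps') on Apple silicon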
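
nn.Module has no built-in device attribute, which is why the patch stores one on the HybridClassifier instance (defaulting to CPU in __init__) and re-assigns it in from_pretrained after moving the weights. A minimal sketch of that pattern, using a hypothetical TinyClassifier stand-in:

import torch
import torch.nn as nn

class TinyClassifier(nn.Module):  # hypothetical stand-in for HybridClassifier
    def __init__(self) -> None:
        super().__init__()
        self.linear = nn.Linear(4, 2)
        # Plain attribute; nn.Module does not track a .device itself
        self.device = torch.device("cpu")

model = TinyClassifier()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)
model.device = device  # keep the attribute in sync, as the patch does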
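
The custom_models.py changes follow the usual round trip for GPU inference: inputs are moved to the model's device, and logits come back to CPU before .numpy(), since numpy conversion only works on CPU tensors. A self-contained sketch under those assumptions (the Linear model and random input are placeholders for the intent model and tokenizer output):

import torch
import torch.nn.functional as F

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = torch.nn.Linear(4, 2).to(device)  # placeholder for the intent model
inputs = torch.randn(1, 4).to(device)     # mirrors tokens["input_ids"].to(device)

with torch.no_grad():  # inference only, like the eval/requires_grad=False setup
    logits = model(inputs)

# .cpu() before softmax/.numpy(), exactly as run_inference now does
probs = F.softmax(logits.cpu(), dim=-1).numpy()[0]
print(probs.tolist())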