fix: evals for ask_holmes are green. evals for investigations are WIP
nherment committed Jan 24, 2025
1 parent 82f9384 commit d2aa404
Showing 129 changed files with 1,773 additions and 529 deletions.
3 changes: 2 additions & 1 deletion holmes/core/investigation_structured_output.py
@@ -6,7 +6,8 @@
"Conclusions and Possible Root causes": "what conclusions can you reach based on the data you found? what are possible root causes (if you have enough conviction to say) or what uncertainty remains",
"Next Steps": "what you would do next to troubleshoot this issue, any commands that could be run to fix it, or other ways to solve it (prefer giving precise bash commands when possible)",
"Related logs": "Truncate and share the most relevant logs, especially if these explain the root cause. For example: \nLogs from pod robusta-holmes:\n```\n<logs>```\n",
"App or Infra": "Explain whether the issue is more likely at the infrastructure or at the application level and why you think that. Say you don't know if you are not sure."
"App or Infra?": "Explain whether the issue is more likely an infrastructure or an application level issue and why you think that.",
"External links": "Provide links to external sources. Where to look when investigating this issue. For example provide links to relevant runbooks, etc. Add a short sentence describing each link."
}

def get_output_format_for_investigation(sections: Dict[str, str]) -> Dict[str, Any]:
4 changes: 2 additions & 2 deletions holmes/core/models.py
@@ -1,12 +1,12 @@
from holmes.core.tool_calling_llm import ToolCallResult
from typing import Optional, List, Dict, Any
from typing import Optional, List, Dict, Any, Union
from pydantic import BaseModel, model_validator
from enum import Enum


class InvestigationResult(BaseModel):
analysis: Optional[str] = None
sections: Optional[Dict[str, str]] = None
sections: Optional[Dict[str, Union[str, None]]] = None
tool_calls: List[ToolCallResult] = []
instructions: List[str] = []

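For context on the type change above, here is a minimal sketch (not part of the commit) of how widening `sections` to `Dict[str, Union[str, None]]` lets individual section values be null. The section titles are taken from the structured-output template earlier in this diff; the values are made up.

```python
from typing import Dict, Optional, Union

from pydantic import BaseModel


class InvestigationResult(BaseModel):
    # A value of None means the model returned the section key
    # but had no content for that section.
    analysis: Optional[str] = None
    sections: Optional[Dict[str, Union[str, None]]] = None


# Validates even though "External links" carries no content;
# both values below are illustrative only.
result = InvestigationResult(
    sections={
        "Next Steps": "Increase the memory limit and watch for further OOMKills.",
        "External links": None,
    }
)
print(result.sections)
```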
4 changes: 1 addition & 3 deletions holmes/core/tool_calling_llm.py
@@ -7,9 +7,7 @@
from holmes.core.performance_timing import PerformanceTiming
from holmes.utils.tags import format_tags_in_string, parse_messages_tags
from holmes.plugins.prompts import load_and_render_prompt
from typing import List, Optional
from holmes.core.llm import LLM
from holmes.plugins.prompts import load_and_render_prompt
from openai import BadRequestError
from openai._types import NOT_GIVEN
from openai.types.chat.chat_completion_message_tool_call import (
@@ -32,7 +30,7 @@ class ToolCallResult(BaseModel):

class LLMResult(BaseModel):
tool_calls: Optional[List[ToolCallResult]] = None
sections: Optional[Dict[str, str]] = None
sections: Optional[Dict[str, Union[str, None]]] = None
result: Optional[str] = None
unprocessed_result: Optional[str] = None
instructions: List[str] = []
6 changes: 4 additions & 2 deletions holmes/plugins/prompts/_general_instructions.jinja2
@@ -13,6 +13,7 @@ In general:
* if you don't know, say that the analysis was inconclusive.
* if there are multiple possible causes list them in a numbered list.
* there will often be errors in the data that are not relevant or that do not have an impact - ignore them in your conclusion if you were not able to tie them to an actual error.
* Always check a pod's logs when checking if it is healthy. Don't assume that because the pod is running and reports healthy that it is running without issues.

If investigating Kubernetes problems:
* run as many kubectl commands as you need to gather more information, then respond.
@@ -21,6 +22,7 @@ If investigating Kubernetes problems:
* if the user wants to find a specific term in a pod's logs, use kubectl_logs_grep
* use both kubectl_previous_logs and kubectl_logs when reading logs. Treat the output of both as a single unified logs stream
* when investigating a pod that crashed or application errors, always run kubectl_describe and fetch the logs
* if a pod has multiple containers, make sure you fetch the logs for either all or relevant containers using one of the containers log functions like kubectl_logs_all_containers, kubectl_logs_all_containers_grep or any other.
* do not give an answer like "The pod is pending" as that doesn't state why the pod is pending and how to fix it.
* do not give an answer like "Pod's node affinity/selector doesn't match any available nodes" because that doesn't include data on WHICH label doesn't match
* if investigating an issue on many pods, there is no need to check more than 3 individual pods in the same deployment. pick up to a representative 3 from each deployment if relevant
@@ -44,7 +46,7 @@
If Helm tools are unavailable, skip this step.
- Check for a cluster role with <RELEASE_NAME>-holmes-cluster-role in its name and a service account with <RELEASE_NAME>-holmes-service-account in its name to troubleshoot missing permissions where release name is the name you found earlier if helm tools are available (If the exact cluster role or service account isn't found, search for similar or related names, including variations or prefixes/suffixes that might be used in the cluster.)
- Focus on identifying absent permissions that align with the error message.
4. **Update the Configuration**
If necessary permissions are absent both in customClusterRoleRules and the cluster role mentioned previously, ALWAYS advise the user to update their configuration by modifying the `generated_values.yaml` file as follows:
If necessary permissions are absent both in customClusterRoleRules and the cluster role mentioned previously, ALWAYS advise the user to update their configuration by modifying the `generated_values.yaml` file as follows:
```
holmes:
customClusterRoleRules:
@@ -64,7 +66,7 @@
Reminder:
* Strive for thoroughness and precision, ensuring the issue is fully addressed.

Special cases and how to reply:
* if you are unable to investigate something properly because you do not have tools that let you access the right data, explicitly tell the user that you are missing an integration to access XYZ which you would need to investigate. you should give an answer similar to "I don't have access to <details>. Please add a Holmes integration for <XYZ> so that I can investigate this."
* if you lack tools to access the right data or don't have access to a system, explicitly tell the user that you are missing an integration to access XYZ which you would need to investigate. you should give an answer similar to "I don't have access to <details>. Please add a Holmes integration for <XYZ> so that I can investigate this."
* make sure you differentiate between "I investigated and found error X caused this problem" and "I tried to investigate but while investigating I got some errors that prevented me from completing the investigation."
* as a special case of that, If a tool generates a permission error when attempting to run it, follow the Handling Permission Errors section for detailed guidance.
* that is different than - for example - fetching a pod's logs and seeing that the pod itself has permission errors. in that case, you explain say that permission errors are the cause of the problem and give details
10 changes: 7 additions & 3 deletions holmes/plugins/toolsets/kubernetes_logs.yaml
@@ -11,7 +11,7 @@ toolsets:
tools:
- name: "kubectl_previous_logs"
description: "Run `kubectl logs --previous` on a single Kubernetes pod. Used to fetch logs for a pod that crashed and see logs from before the crash. Never give a deployment name or a resource that is not a pod."
command: "kubectl logs {{ name}} -n {{ namespace }} --previous"
command: "kubectl logs {{pod_name}} -n {{ namespace }} --previous"

- name: "kubectl_previous_logs_all_containers"
description: "Run `kubectl logs --previous` on a single Kubernetes pod. Used to fetch logs for a pod that crashed and see logs from before the crash."
@@ -23,7 +23,7 @@

- name: "kubectl_logs"
description: "Run `kubectl logs` on a single Kubernetes pod. Never give a deployment name or a resource that is not a pod."
command: "kubectl logs {{name}} -n {{ namespace }}"
command: "kubectl logs {{pod_name}} -n {{ namespace }}"

- name: "kubectl_logs_all_containers"
description: "Run `kubectl logs` on all containers within a single Kubernetes pod."
@@ -35,4 +35,8 @@

- name: "kubectl_logs_grep"
description: "Search for a specific term in the logs of a single Kubernetes pod. Only provide a pod name, not a deployment or other resource."
command: "kubectl logs {{ name }} -n {{ namespace }} | grep {{ search_term }}"
command: "kubectl logs {{ pod_name }} -n {{ namespace }} | grep {{ search_term }}"

- name: "kubectl_logs_all_containers_grep"
description: "Search for a specific term in the logs of a single Kubernetes pod across all of its containers. Only provide a pod name, not a deployment or other resource."
command: "kubectl logs {{pod_name}} -n {{ namespace }} --all-containers | grep {{ search_term }}"
4 changes: 2 additions & 2 deletions tests/llm/README.md
@@ -46,7 +46,7 @@ Here are the possible fields in the `test_case.yaml` yaml file:
| retrieval_context | List[str] | Optional | - pod xyz is running and healthy - there are no errors in the logs | Context that the LLM is expected to have used in its answer. If present, this generates a 'context' score proportional to the number of matching context elements found in the LLM's output. |
| evaluation | Dict[str, float] | Optional | evaluation: <br/> faithfulness: 1 <br/> context: 1 <br/> | The minimum expected scores. The test will fail unless these are met. Set to 0 for unstable tests. |
| before-test | str | Optional | kubectl apply -f manifest.yaml | A command to run before the LLM evaluation. The CWD for this command is the same folder as the fixture. This step is skipped unless `RUN_LIVE` environment variable is set |
| after-test | str | Optional | kubectl delete -f manifest.yaml | A command to run after the LLM evaluation.The CWD for this command is the same folder as the fixture. Typically cleans up any before-test action. This step is skipped unless `RUN_LIVE` environment variable is set |
| after-test | str | Optional | kubectl delete -f manifest.yaml | A command to run after the LLM evaluation.The CWD for this command is the same folder as the fixture. Typically cleans up any before-test action. This step is skipped unless `RUN_LIVE` environment variable is set |
| generate_mocks | bool | Optional | True | Whether the test suite should generate mock files. Existing mock files are overwritten. |


@@ -58,7 +58,7 @@ Run the following:
UPLOAD_DATASET=1 RUN_LIVE=1 poetry run pytest ./tests/llm/test_ask_holmes.py -k 999_my_test_case
```

The test may pass or not based on whether the evaluation scores are high enough. If the test fail,
The test may pass or not based on whether the evaluation scores are high enough.
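For reference, a minimal sketch (not from the repository) that writes a `test_case.yaml` using fields documented in the table above; the prompt and expected output are invented, while `evaluation.correctness` and `generate_mocks` mirror the fixture changes in this commit.

```python
import yaml  # PyYAML, assumed available in the test environment

# Hypothetical fixture for a new eval; only the field names are taken
# from the table above and the fixtures changed in this commit.
test_case = {
    "user_prompt": "Is the nginx pod in the default namespace healthy?",
    "expected_output": ["The pod is running and healthy"],
    "evaluation": {"correctness": 1},
    "generate_mocks": True,
}

with open("test_case.yaml", "w") as f:
    yaml.safe_dump(test_case, f, sort_keys=False)
```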

# Environment variables

@@ -8,5 +8,4 @@ retrieval_context:
- One of the pod is in running state but has a container in the process of starting or restarting
- 6 pods are not running as indicated by the STATUS column
evaluation:
answer_relevancy: 0
faithfulness: 0
correctness: 0
@@ -1,4 +1,4 @@
{"toolset_name":"kubernetes/logs","tool_name":"kubectl_logs","match_params":{"name":"robusta-runner-6958c5bdd8-69gtn","namespace":"default"}}
{"toolset_name":"kubernetes/logs","tool_name":"kubectl_logs","match_params":{"pod_name":"robusta-runner-6958c5bdd8-69gtn","namespace":"default"}}
setting up colored logging
2024-11-04 09:29:28.636 INFO logger initialized using INFO log level
2024-11-04 09:29:28.640 INFO Creating hikaru monkey patches
@@ -1,2 +1,2 @@
{"toolset_name":"kubernetes/logs","tool_name":"kubectl_previous_logs","match_params":{"name":"robusta-runner-6958c5bdd8-69gtn","namespace":"default"}}
{"toolset_name":"kubernetes/logs","tool_name":"kubectl_previous_logs","match_params":{"pod_name":"robusta-runner-6958c5bdd8-69gtn","namespace":"default"}}
Error from server (BadRequest): previous terminated container "runner" in pod "robusta-runner-6958c5bdd8-69gtn" not found
@@ -1,8 +1,8 @@
user_prompt: 'What is wrong with << { "type": "pod", "name": "robusta-runner-6958c5bdd8-69gtn" } >> ?'
expected_output:
- The pod was recently killed due to it being out of memory
- The pod is currently running and healthy
- The pod was killed due to it being out of memory
retrieval_context:
- The pod is currently running and healthy
- |
Last State: Terminate
Reason: OOMKilled
@@ -13,3 +13,5 @@ retrieval_context:
Message ---- ------ ---- ---- -------
Warning OOMKilled 1m kubelet
Container test-container in pod test-pod was killed due to out of memory
evaluation:
correctness: 1
@@ -7,4 +7,4 @@ retrieval_context:
- "The name of the grafana service is my_grafana_4j981"
- "Grafana is running on port 3000"
evaluation:
faithfulness: 0.5
correctness: 1
@@ -17,5 +17,4 @@ retrieval_context:
Normal Created 4m28s kubelet Created container nginxreplica
Normal Started 4m28s kubelet Started container nginxreplica
evaluation:
answer_relevancy: 0
faithfulness: 0
correctness: 1
@@ -4,3 +4,6 @@ expected_output:
- The image version of the `robusta-runner` pod is `0.18.0`.
retrieval_context:
- "image: robustadev/robusta-runner:0.18.0"
evaluation:
correctness: 1
generate_mocks: True
@@ -1,4 +1,4 @@
{"toolset_name":"findings","tool_name":"fetch_finding_by_id","match_params":{"id":"asodfkq1209edyhqawdo2uydqawidh"}}
{"toolset_name":"robusta","tool_name":"fetch_finding_by_id","match_params":{"id":"asodfkq1209edyhqawdo2uydqawidh"}}
{
"id": "asodfkq1209edyhqawdo2uydqawidh",
"description": "The node was low on resource: ephemeral-storage. Threshold quantity: 2126213140, available: 1884540Ki. Container kafka was using 3264Ki, request is 0, has larger consumption of ephemeral-storage. ",
@@ -7,5 +7,5 @@ retrieval_context:
- "Pod my-cluster-kafka was evicted"
- "The node was low on resource: ephemeral-storage. Threshold quantity: 2126213140, available: 1884540Ki. Container kafka was using 3264Ki, request is 0, has larger consumption of ephemeral-storage."
evaluation:
answer_relevancy: 0
faithfulness: 0
correctness: 1
generate_mocks: True
@@ -1,4 +1,4 @@
{"toolset_name":"kubernetes/logs","tool_name":"kubectl_logs","match_params":{"name":"customer-orders-766b65899b-lwqnt","namespace":"default"}}
{"toolset_name":"kubernetes/logs","tool_name":"kubectl_logs","match_params":{"pod_name":"customer-orders-766b65899b-lwqnt","namespace":"default"}}
stdout:
INFO: Started server process [1]
INFO: Waiting for application startup.
@@ -1,4 +1,4 @@
{"toolset_name":"kubernetes/logs","tool_name":"kubectl_previous_logs","match_params":{"name":"customer-orders-766b65899b-lwqnt","namespace":"default"}}
{"toolset_name":"kubernetes/logs","tool_name":"kubectl_previous_logs","match_params":{"pod_name":"customer-orders-766b65899b-lwqnt","namespace":"default"}}
Command `kubectl logs customer-orders-766b65899b-lwqnt -n default --previous` failed with return code 1
stdout:

@@ -4,3 +4,5 @@ expected_output:
- Database operations are taking between 5 to 10 seconds to complete
before_test: kubectl apply -f ./helm/manifest.yaml
after_test: kubectl delete -f ./helm/manifest.yaml
evaluation:
correctness: 1
@@ -17,7 +17,7 @@ spec:
nslookup front-end.sock-shop.svc.cluster.local
# this works, but not if triggered too frequently (then the svc might not have ready endpoints so communicating with it fails?)
SERVICE_URL="10.96.104.145"
SERVICE_URL="10.96.34.222"
echo "Logging in"
curl -4 -u Eve_Berger:eve -c cookie.txt ${SERVICE_URL}/login
echo "Logged in"