14 changes: 10 additions & 4 deletions graphrag/index/operations/build_noun_graph/build_noun_graph.py
@@ -108,15 +108,21 @@ def _extract_edges(
text_units_df = text_units_df.assign(edges=all_edges) # type: ignore
edge_df = text_units_df.explode("edges")[["edges", "text_unit_id"]]

edge_df[["source", "target"]] = edge_df.loc[:, "edges"].to_list()
# convert tuples/lists into a two-column DataFrame, padding with NaNs as needed
edges_list = edge_df["edges"].apply(
lambda tup: list(tup) if isinstance(tup, (list, tuple)) else [np.nan, np.nan]
)
edges_expanded = pd.DataFrame(edges_list.tolist(), index=edge_df.index, columns=["source", "target"]).iloc[:, :2]
edge_df = pd.concat([edge_df.drop(columns=["edges"]), edges_expanded], axis=1)
edge_df = edge_df.dropna(subset=["source", "target"]) # drop any rows with invalid source & target

edge_df["min_source"] = edge_df[["source", "target"]].min(axis=1)
edge_df["max_target"] = edge_df[["source", "target"]].max(axis=1)
edge_df = edge_df.drop(columns=["source", "target"]).rename(
columns={"min_source": "source", "max_target": "target"} # type: ignore
)

edge_df = edge_df[(edge_df.source.notna()) & (edge_df.target.notna())]
edge_df = edge_df.drop(columns=["edges"])
# edge_df = edge_df[(edge_df.source.notna()) & (edge_df.target.notna())]
# edge_df = edge_df.drop(columns=["edges"])
# group by source and target, count the number of text units
grouped_edge_df = (
edge_df.groupby(["source", "target"]).agg({"text_unit_id": list}).reset_index()
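For reference, a minimal self-contained sketch of how the padded expansion above behaves (the toy edges and text-unit IDs are illustrative, not project data): a malformed "edges" entry becomes a NaN pair instead of breaking the two-column assignment, and is dropped afterwards.

import numpy as np
import pandas as pd

edge_df = pd.DataFrame(
    {
        "edges": [("alpha", "beta"), None, ("beta", "gamma")],
        "text_unit_id": ["t1", "t2", "t3"],
    }
)

# Pad anything that is not a list/tuple so every row yields exactly two values.
edges_list = edge_df["edges"].apply(
    lambda tup: list(tup) if isinstance(tup, (list, tuple)) else [np.nan, np.nan]
)
edges_expanded = pd.DataFrame(
    edges_list.tolist(), index=edge_df.index, columns=["source", "target"]
)
edge_df = pd.concat([edge_df.drop(columns=["edges"]), edges_expanded], axis=1)
edge_df = edge_df.dropna(subset=["source", "target"])  # drops the None row

print(edge_df)
#   text_unit_id source target
# 0           t1  alpha   beta
# 2           t3   beta  gamma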
4 changes: 4 additions & 0 deletions graphrag/index/operations/graph_to_dataframes.py
@@ -35,4 +35,8 @@ def graph_to_dataframes(
if edge_columns:
edges = edges.loc[:, edge_columns]

# ndarray embeddings to be stored as list objects to avoid broadcasting errors
if "embedding" in nodes.columns and isinstance(nodes["embedding"].iloc[0], (list, tuple)) is False:
nodes["embedding"] = nodes["embedding"].apply(lambda x: x.tolist() if hasattr(x, "tolist") else x)

return (nodes, edges)
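A small illustrative sketch of the embedding guard above (toy titles and vectors, assuming pandas and numpy): ndarray cells are converted to plain lists, so each row holds one list object rather than an array that pandas may try to broadcast.

import numpy as np
import pandas as pd

nodes = pd.DataFrame({"title": ["a", "b"]})
nodes["embedding"] = [np.array([0.1, 0.2]), np.array([0.3, 0.4])]

# Mirror of the added check: convert only when the cells are not already lists/tuples.
if "embedding" in nodes.columns and not isinstance(nodes["embedding"].iloc[0], (list, tuple)):
    nodes["embedding"] = nodes["embedding"].apply(
        lambda x: x.tolist() if hasattr(x, "tolist") else x
    )

print(type(nodes["embedding"].iloc[0]))  # <class 'list'>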
4 changes: 4 additions & 0 deletions graphrag/index/operations/prune_graph.py
@@ -29,6 +29,10 @@ def prune_graph(
# remove ego nodes if needed
degree = cast("DegreeView", graph.degree)
degrees = list(degree()) # type: ignore
# empty graph with nothing to prune
if not degrees:
return graph

if remove_ego_nodes:
# ego node is one with highest degree
ego_node = max(degrees, key=lambda x: x[1])
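A standalone sketch of why the early return matters (the prune_ego helper below is hypothetical, not the project function): on an empty graph, graph.degree yields nothing, so max() over the degree list would raise ValueError; returning the graph unchanged sidesteps that.

import networkx as nx

def prune_ego(graph: nx.Graph) -> nx.Graph:
    degrees = list(graph.degree)
    if not degrees:  # empty graph: max() below would raise ValueError
        return graph
    ego_node, _ = max(degrees, key=lambda x: x[1])  # highest-degree node
    graph.remove_node(ego_node)
    return graph

print(prune_ego(nx.Graph()))        # Graph with 0 nodes and 0 edges
print(prune_ego(nx.path_graph(3)))  # middle node removed: 2 nodes, 0 edges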
6 changes: 5 additions & 1 deletion graphrag/index/workflows/prune_graph.py
@@ -69,8 +69,12 @@ def prune_graph(

# subset the full nodes and edges to only include the pruned remainders
subset_entities = pruned_nodes.merge(entities, on="title", how="inner")
pruned_edges["source"] = pruned_edges["source"].astype(str)
pruned_edges["target"] = pruned_edges["target"].astype(str)
entities["title"] = entities["title"].astype(str)

subset_relationships = pruned_edges.merge(
relationships, on=["source", "target"], how="inner"
entities, how="inner", left_on="source", right_on="title"
)

return (subset_entities, subset_relationships)
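An illustrative sketch of the dtype alignment above (invented frames, not the workflow tables): casting both join keys to str keeps the inner merge from silently coming back empty when one side holds numeric-looking values.

import pandas as pd

pruned_edges = pd.DataFrame({"source": [1, 2], "target": [2, 3]})
entities = pd.DataFrame({"title": ["1", "3"], "degree": [5, 2]})

# Without these casts, int sources never match str titles and the merge is empty.
pruned_edges["source"] = pruned_edges["source"].astype(str)
pruned_edges["target"] = pruned_edges["target"].astype(str)
entities["title"] = entities["title"].astype(str)

subset = pruned_edges.merge(entities, how="inner", left_on="source", right_on="title")
print(subset)  # one matching row: source "1" joins title "1"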
4 changes: 0 additions & 4 deletions graphrag/prompts/__init__.py

This file was deleted.

@@ -1,9 +1,4 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""Basic Search prompts."""

BASIC_SEARCH_SYSTEM_PROMPT = """
---Role---

You are a helpful assistant responding to questions about data in the tables provided.
@@ -70,4 +65,3 @@
{response_type}

Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown.
"""
@@ -1,8 +1,4 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
"""A file containing prompts definition."""

COMMUNITY_REPORT_PROMPT = """
You are an AI assistant that helps a human analyst to perform general information discovery. Information discovery is the process of identifying and assessing relevant information associated with certain entities (e.g., organizations and individuals) within a network.

# Goal
@@ -150,4 +146,4 @@

Limit the total report length to {max_report_length} words.

Output:"""
Output:
@@ -1,9 +1,4 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""A file containing prompts definition."""

COMMUNITY_REPORT_TEXT_PROMPT = """
You are an AI assistant that helps a human analyst to perform general information discovery.
Information discovery is the process of identifying and assessing relevant information associated with certain entities (e.g., organizations and individuals) within a network.

@@ -69,15 +64,21 @@
"findings": [
{{
"summary": "Risk Management Operational Scope",
"explanation": "The Risk Management Department at Enron plays a pivotal role in identifying, assessing, and mitigating financial risks. Their proactive approach, highlighted from the beginning of 2000, helps safeguard Enron against potential financial pitfalls and ensures continuous compliance with evolving market regulations. Effective risk management not only prevents financial anomalies but also supports the company's strategic decision-making processes.\n\n[Data: Sources (2, 3), Date_Range ((2000, 01, 01), (2000, 07, 12))]"
"explanation": "The Risk Management Department at Enron plays a pivotal role in identifying, assessing, and mitigating financial risks. Their proactive approach, highlighted from the beginning of 2000, helps safeguard Enron against potential financial pitfalls and ensures continuous compliance with evolving market regulations. Effective risk management not only prevents financial anomalies but also supports the company's strategic decision-making processes.

[Data: Sources (2, 3), Date_Range ((2000, 01, 01), (2000, 07, 12))]"
}},
{{
"summary": "Legal Compliance and Governance",
"explanation": "The Legal Compliance Department ensures that all Enron's operations adhere to the legal standards set by regulatory bodies. Their focus on corporate governance and contract management, noted starting Q2 2000, is crucial in maintaining Enron's reputation and operational legality, especially in managing complex contracts and corporate agreements. Their efforts underscore the commitment to upholding high legal standards and ethical practices.\n\n[Data: Source (5), Date_Range ((2000, 04, 01), (2000, 07, 12))]"
"explanation": "The Legal Compliance Department ensures that all Enron's operations adhere to the legal standards set by regulatory bodies. Their focus on corporate governance and contract management, noted starting Q2 2000, is crucial in maintaining Enron's reputation and operational legality, especially in managing complex contracts and corporate agreements. Their efforts underscore the commitment to upholding high legal standards and ethical practices.

[Data: Source (5), Date_Range ((2000, 04, 01), (2000, 07, 12))]"
}},
{{
"summary": "Interdepartmental Collaboration for Compliance",
"explanation": "Collaboration between the Risk Management and Legal Compliance Departments, established in Q2 2000, ensures that risk mitigation strategies are legally sound and that compliance measures consider financial risks. This synergy is vital for holistic governance and has been instrumental in integrating risk management with legal compliance strategies at Enron. Enhanced interdepartmental cooperation during this period plays a crucial role in aligning the company's strategies with regulatory requirements.\n\n[Data: Sources (9), Date_Range ((2000, 04, 01), (2000, 07, 12))]"
"explanation": "Collaboration between the Risk Management and Legal Compliance Departments, established in Q2 2000, ensures that risk mitigation strategies are legally sound and that compliance measures consider financial risks. This synergy is vital for holistic governance and has been instrumental in integrating risk management with legal compliance strategies at Enron. Enhanced interdepartmental cooperation during this period plays a crucial role in aligning the company's strategies with regulatory requirements.

[Data: Sources (9), Date_Range ((2000, 04, 01), (2000, 07, 12))]"
}}
],
"date_range": ["2000-01-01", "2000-07-12"]
@@ -92,4 +93,3 @@
{input_text}

Output:
"""
60 changes: 60 additions & 0 deletions graphrag/prompts/drift_reduce_prompt.txt
@@ -0,0 +1,60 @@

---Role---

You are a helpful assistant responding to questions about data in the reports provided.

---Goal---

Generate a response of the target length and format that responds to the user's question, summarizing all information in the input reports appropriate for the response length and format, and incorporating any relevant general knowledge while being as specific, accurate and concise as possible.

If you don't know the answer, just say so. Do not make anything up.

Points supported by data should list their data references as follows:

"This is an example sentence supported by multiple data references [Data: <dataset name> (record ids); <dataset name> (record ids)]."

Do not list more than 5 record ids in a single reference. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more.

For example:

"Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Sources (1, 5, 15)]."

Do not include information where the supporting evidence for it is not provided.

If you decide to use general knowledge, you should add a delimiter stating that the information is not supported by the data tables. For example:

"Person X is the owner of Company Y and subject to many allegations of wrongdoing. [Data: General Knowledge (href)]"

---Data Reports---

{context_data}

---Target response length and format---

{response_type}


---Goal---

Generate a response of the target length and format that responds to the user's question, summarizing all information in the input reports appropriate for the response length and format, and incorporating any relevant general knowledge while being as specific, accurate and concise as possible.

If you don't know the answer, just say so. Do not make anything up.

Points supported by data should list their data references as follows:

"This is an example sentence supported by multiple data references [Data: <dataset name> (record ids); <dataset name> (record ids)]."

Do not list more than 5 record ids in a single reference. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more.

For example:

"Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Sources (1, 5, 15)]."

Do not include information where the supporting evidence for it is not provided.

If you decide to use general knowledge, you should add a delimiter stating that the information is not supported by the data tables. For example:

"Person X is the owner of Company Y and subject to many allegations of wrongdoing. [Data: General Knowledge (href)]".

Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown. Now answer the following query using the data above:

67 changes: 67 additions & 0 deletions graphrag/prompts/drift_search_system_prompt.txt
@@ -0,0 +1,67 @@

---Role---

You are a helpful assistant responding to questions about data in the tables provided.


---Goal---

Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge.

If you don't know the answer, just say so. Do not make anything up.

Points supported by data should list their data references as follows:

"This is an example sentence supported by multiple data references [Data: <dataset name> (record ids); <dataset name> (record ids)]."

Do not list more than 5 record ids in a single reference. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more.

For example:

"Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Sources (15, 16)]."

where 15, 16, 1, 5, 7, 23, 2, 7, 34, 46, and 64 represent the id (not the index) of the relevant data record.

Pay close attention specifically to the Sources tables as it contains the most relevant information for the user query. You will be rewarded for preserving the context of the sources in your response.

---Target response length and format---

{response_type}


---Data tables---

{context_data}


---Goal---

Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge.

If you don't know the answer, just say so. Do not make anything up.

Points supported by data should list their data references as follows:

"This is an example sentence supported by multiple data references [Data: <dataset name> (record ids); <dataset name> (record ids)]."

Do not list more than 5 record ids in a single reference. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more.

For example:

"Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Sources (15, 16)]."

where 15, 16, 1, 5, 7, 23, 2, 7, 34, 46, and 64 represent the id (not the index) of the relevant data record.

Pay close attention specifically to the Sources tables as it contains the most relevant information for the user query. You will be rewarded for preserving the context of the sources in your response.

---Target response length and format---

{response_type}

Add sections and commentary to the response as appropriate for the length and format.

Additionally provide a score between 0 and 100 representing how well the response addresses the overall research question: {global_query}. Based on your response, suggest up to five follow-up questions that could be asked to further explore the topic as it relates to the overall research question. Do not include scores or follow up questions in the 'response' field of the JSON, add them to the respective 'score' and 'follow_up_queries' keys of the JSON output. Format your response in JSON with the following keys and values:

{{'response': str, Put your answer, formatted in markdown, here. Do not answer the global query in this section.
'score': int,
'follow_up_queries': List[str]}}
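Purely as an illustration of the output contract this prompt describes (all values invented), a model response shaped to those keys would look roughly like the following, written here as a Python literal:

example_output = {
    "response": "## Findings\n\nCompany Y is linked to several allegations [Data: Sources (15, 16)].",
    "score": 72,
    "follow_up_queries": [
        "What other entities are connected to Company Y?",
        "How did the allegations against Person X develop over time?",
    ],
}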
@@ -1,9 +1,4 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""A file containing prompts definition."""

EXTRACT_CLAIMS_PROMPT = """
-Target activity-
You are an intelligent assistant that helps a human analyst to analyze claims against certain entities presented in a text document.

@@ -54,8 +49,4 @@
Entity specification: {entity_specs}
Claim description: {claim_description}
Text: {input_text}
Output:"""


CONTINUE_PROMPT = "MANY entities were missed in the last extraction. Add them below using the same format:\n"
LOOP_PROMPT = "It appears some entities may have still been missed. Answer Y if there are still entities that need to be added, or N if there are none. Please answer with a single letter Y or N.\n"
Output:
@@ -1,9 +1,4 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""A file containing prompts definition."""

GRAPH_EXTRACTION_PROMPT = """
-Goal-
Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.

@@ -123,7 +118,4 @@
Entity_types: {entity_types}
Text: {input_text}
######################
Output:"""

CONTINUE_PROMPT = "MANY entities and relationships were missed in the last extraction. Remember to ONLY emit entities that match any of the previously extracted types. Add them below using the same format:\n"
LOOP_PROMPT = "It appears some entities and relationships may have still been missed. Answer Y if there are still entities or relationships that need to be added, or N if there are none. Please answer with a single letter Y or N.\n"
Output:
@@ -1,9 +1,3 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""Global Search system prompts."""

GENERAL_KNOWLEDGE_INSTRUCTION = """
The response may also include relevant real-world knowledge outside the dataset, but it must be explicitly annotated with a verification tag [LLM: verify]. For example:
"This is an example sentence supported by real-world knowledge [LLM: verify]."
"""
@@ -1,9 +1,4 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""System prompts for global search."""

MAP_SYSTEM_PROMPT = """
---Role---

You are a helpful assistant responding to questions about data in the tables provided.
@@ -82,4 +77,3 @@
{{"description": "Description of point 2 [Data: Reports (report ids)]", "score": score_value}}
]
}}
"""
@@ -1,9 +1,4 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""Global Search system prompts."""

REDUCE_SYSTEM_PROMPT = """
---Role---

You are a helpful assistant responding to questions about a dataset by synthesizing perspectives from multiple analysts.
@@ -78,8 +73,3 @@
{response_type}

Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown.
"""

NO_DATA_ANSWER = (
"I am sorry but I am unable to answer this question given the provided data."
)
4 changes: 0 additions & 4 deletions graphrag/prompts/index/__init__.py

This file was deleted.

@@ -1,9 +1,4 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""Local search system prompts."""

LOCAL_SEARCH_SYSTEM_PROMPT = """
---Role---

You are a helpful assistant responding to questions about data in the tables provided.
Expand Down Expand Up @@ -66,4 +61,3 @@
{response_type}

Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown.
"""
4 changes: 0 additions & 4 deletions graphrag/prompts/query/__init__.py

This file was deleted.
