diff --git a/graphrag/index/operations/build_noun_graph/build_noun_graph.py b/graphrag/index/operations/build_noun_graph/build_noun_graph.py
index 0e45e351ac..9a447028df 100644
--- a/graphrag/index/operations/build_noun_graph/build_noun_graph.py
+++ b/graphrag/index/operations/build_noun_graph/build_noun_graph.py
@@ -108,15 +108,21 @@ def _extract_edges(
     text_units_df = text_units_df.assign(edges=all_edges)  # type: ignore
     edge_df = text_units_df.explode("edges")[["edges", "text_unit_id"]]
-    edge_df[["source", "target"]] = edge_df.loc[:, "edges"].to_list()
+    # convert tuples/lists into a two-column DataFrame, padding with NaNs as needed
+    edges_list = edge_df["edges"].apply(
+        lambda tup: list(tup) if isinstance(tup, (list, tuple)) else [np.nan, np.nan]
+    )
+    edges_expanded = pd.DataFrame(edges_list.tolist(), index=edge_df.index, columns=["source", "target"]).iloc[:, :2]
+    edge_df = pd.concat([edge_df.drop(columns=["edges"]), edges_expanded], axis=1)
+    edge_df = edge_df.dropna(subset=["source", "target"])  # drop any rows with an invalid source or target
+
     edge_df["min_source"] = edge_df[["source", "target"]].min(axis=1)
     edge_df["max_target"] = edge_df[["source", "target"]].max(axis=1)
     edge_df = edge_df.drop(columns=["source", "target"]).rename(
         columns={"min_source": "source", "max_target": "target"}  # type: ignore
     )
-
-    edge_df = edge_df[(edge_df.source.notna()) & (edge_df.target.notna())]
-    edge_df = edge_df.drop(columns=["edges"])
+    # edge_df = edge_df[(edge_df.source.notna()) & (edge_df.target.notna())]
+    # edge_df = edge_df.drop(columns=["edges"])
     # group by source and target, count the number of text units
     grouped_edge_df = (
         edge_df.groupby(["source", "target"]).agg({"text_unit_id": list}).reset_index()
     )
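A minimal standalone sketch of what the new expansion does, using plain pandas/numpy rather than the real `_extract_edges` pipeline. It assumes, as the patched code does, that every valid edge is a 2-tuple; singletons that survive `explode` as NaN, and any other malformed cell, are coerced to a NaN pair and dropped:

```python
import numpy as np
import pandas as pd

# A valid pair, a singleton text unit that yielded no pair (NaN after explode),
# and a malformed entry: the cases the patched code must survive.
edge_df = pd.DataFrame(
    {"edges": [("foo", "bar"), np.nan, "oops"], "text_unit_id": [1, 2, 3]}
)

# Same expansion as the patch: coerce anything that is not a list/tuple to a
# NaN pair, then split into explicit source/target columns.
edges_list = edge_df["edges"].apply(
    lambda tup: list(tup) if isinstance(tup, (list, tuple)) else [np.nan, np.nan]
)
edges_expanded = pd.DataFrame(
    edges_list.tolist(), index=edge_df.index, columns=["source", "target"]
)

edge_df = pd.concat([edge_df.drop(columns=["edges"]), edges_expanded], axis=1)
edge_df = edge_df.dropna(subset=["source", "target"])

print(edge_df)  # only the ("foo", "bar") row survives
```

One caveat worth noting in review: `pd.DataFrame(..., columns=["source", "target"])` requires every row to have exactly two elements, so the trailing `.iloc[:, :2]` in the patch only matters if that assumption already holds.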
diff --git a/graphrag/index/operations/graph_to_dataframes.py b/graphrag/index/operations/graph_to_dataframes.py
index dbc608f640..10ede1e760 100644
--- a/graphrag/index/operations/graph_to_dataframes.py
+++ b/graphrag/index/operations/graph_to_dataframes.py
@@ -35,4 +35,8 @@ def graph_to_dataframes(
     if edge_columns:
         edges = edges.loc[:, edge_columns]
 
+    # ndarray embeddings to be stored as list objects to avoid broadcasting errors
+    if "embedding" in nodes.columns and isinstance(nodes["embedding"].iloc[0], (list, tuple)) is False:
+        nodes["embedding"] = nodes["embedding"].apply(lambda x: x.tolist() if hasattr(x, "tolist") else x)
+
     return (nodes, edges)
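A small sketch of the conversion in isolation. Note the guard inspects only the first row, so it would raise `IndexError` on an empty frame; the `hasattr` fallback covers cells that are neither ndarrays nor lists:

```python
import numpy as np
import pandas as pd

nodes = pd.DataFrame({
    "title": ["a", "b"],
    "embedding": [np.array([0.1, 0.2]), np.array([0.3, 0.4])],
})

# Same guard as the patch: only convert when the first cell is not already a
# plain list/tuple; .tolist() turns each ndarray into a plain Python list.
if "embedding" in nodes.columns and not isinstance(nodes["embedding"].iloc[0], (list, tuple)):
    nodes["embedding"] = nodes["embedding"].apply(
        lambda x: x.tolist() if hasattr(x, "tolist") else x
    )

print(type(nodes["embedding"].iloc[0]))  # <class 'list'>
```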
diff --git a/graphrag/index/operations/prune_graph.py b/graphrag/index/operations/prune_graph.py
index d826558584..edf5ba8df9 100644
--- a/graphrag/index/operations/prune_graph.py
+++ b/graphrag/index/operations/prune_graph.py
@@ -29,6 +29,10 @@ def prune_graph(
     # remove ego nodes if needed
     degree = cast("DegreeView", graph.degree)
     degrees = list(degree())  # type: ignore
+    # empty graph with nothing to prune
+    if not degrees:
+        return graph
+
     if remove_ego_nodes:
         # ego node is one with highest degree
         ego_node = max(degrees, key=lambda x: x[1])
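A quick demonstration of the failure mode this guard prevents, sketched with networkx outside the real function:

```python
import networkx as nx

graph = nx.Graph()  # e.g., a corpus too small to yield any noun-pair edges
degrees = list(graph.degree)

# Without the early return, max(degrees, key=...) would raise
# "ValueError: max() arg is an empty sequence" when remove_ego_nodes is set.
if not degrees:
    print("nothing to prune")
else:
    ego_node = max(degrees, key=lambda x: x[1])
```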
diff --git a/graphrag/index/workflows/prune_graph.py b/graphrag/index/workflows/prune_graph.py
index 52987a8df8..05ea054d30 100644
--- a/graphrag/index/workflows/prune_graph.py
+++ b/graphrag/index/workflows/prune_graph.py
@@ -69,8 +69,12 @@ def prune_graph(
     # subset the full nodes and edges to only include the pruned remainders
     subset_entities = pruned_nodes.merge(entities, on="title", how="inner")
+    pruned_edges["source"] = pruned_edges["source"].astype(str)
+    pruned_edges["target"] = pruned_edges["target"].astype(str)
+    entities["title"] = entities["title"].astype(str)
+
     subset_relationships = pruned_edges.merge(
-        relationships, on=["source", "target"], how="inner"
+        entities, how="inner", left_on="source", right_on="title"
     )
 
     return (subset_entities, subset_relationships)
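A sketch of why the `astype(str)` casts matter before the join. In a tiny corpus, entity titles that look numeric can be inferred as `int64` on one side and `object` on the other, and pandas refuses to merge mismatched key dtypes. Also note the replacement join keeps any edge whose source matches a surviving entity title; targets are no longer checked against `relationships`:

```python
import pandas as pd

pruned_edges = pd.DataFrame({"source": [1, 2], "target": [2, 3]})  # numeric-looking titles
entities = pd.DataFrame({"title": ["1", "2", "3"]})

# Without these casts, the merge below typically raises a ValueError about
# trying to merge on int64 and object columns.
pruned_edges["source"] = pruned_edges["source"].astype(str)
pruned_edges["target"] = pruned_edges["target"].astype(str)
entities["title"] = entities["title"].astype(str)

subset = pruned_edges.merge(entities, how="inner", left_on="source", right_on="title")
print(subset)
```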
diff --git a/graphrag/prompts/__init__.py b/graphrag/prompts/__init__.py
deleted file mode 100644
index 3bb0594c36..0000000000
--- a/graphrag/prompts/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""All prompts for the GraphRAG system."""
diff --git a/graphrag/prompts/query/basic_search_system_prompt.py b/graphrag/prompts/basic_search_system_prompt.txt
similarity index 95%
rename from graphrag/prompts/query/basic_search_system_prompt.py
rename to graphrag/prompts/basic_search_system_prompt.txt
index a20fb6ad10..eb2afc42b8 100644
--- a/graphrag/prompts/query/basic_search_system_prompt.py
+++ b/graphrag/prompts/basic_search_system_prompt.txt
@@ -1,9 +1,4 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-"""Basic Search prompts."""
-
-BASIC_SEARCH_SYSTEM_PROMPT = """
 ---Role---
 
 You are a helpful assistant responding to questions about data in the tables provided.
@@ -70,4 +65,3 @@
 {response_type}
 
 Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown.
-"""
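Since the prompts move from importable Python constants to plain `.txt` files, the package now needs a file-based loader. The diff does not show how the new files are resolved, so the following is a hypothetical sketch of the general pattern, not the library's actual API:

```python
from pathlib import Path

# Assumed location: graphrag/prompts/, alongside the renamed .txt templates.
PROMPT_DIR = Path(__file__).parent

def load_prompt(name: str) -> str:
    """Read a prompt template such as 'basic_search_system_prompt.txt'."""
    return (PROMPT_DIR / name).read_text(encoding="utf-8")

basic_search_prompt = load_prompt("basic_search_system_prompt.txt")
```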
diff --git a/graphrag/prompts/index/community_report.py b/graphrag/prompts/community_report_graph.txt
similarity index 98%
rename from graphrag/prompts/index/community_report.py
rename to graphrag/prompts/community_report_graph.txt
index c3a7702ba0..595e7a3e86 100644
--- a/graphrag/prompts/index/community_report.py
+++ b/graphrag/prompts/community_report_graph.txt
@@ -1,8 +1,4 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-"""A file containing prompts definition."""
-COMMUNITY_REPORT_PROMPT = """
 You are an AI assistant that helps a human analyst to perform general information discovery. Information discovery is the process of identifying and assessing relevant information associated with certain entities (e.g., organizations and individuals) within a network.
 
 # Goal
@@ -150,4 +146,4 @@
 Limit the total report length to {max_report_length} words.
 
-Output:"""
+Output:
\ No newline at end of file
diff --git a/graphrag/prompts/index/community_report_text_units.py b/graphrag/prompts/community_report_text.txt
similarity index 94%
rename from graphrag/prompts/index/community_report_text_units.py
rename to graphrag/prompts/community_report_text.txt
index 47fcd29c09..cfe41eff5f 100644
--- a/graphrag/prompts/index/community_report_text_units.py
+++ b/graphrag/prompts/community_report_text.txt
@@ -1,9 +1,4 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-"""A file containing prompts definition."""
-
-COMMUNITY_REPORT_TEXT_PROMPT = """
 You are an AI assistant that helps a human analyst to perform general information discovery. Information discovery is the process of identifying and assessing relevant information associated with certain entities (e.g., organizations and individuals) within a network.
@@ -69,15 +64,21 @@
 "findings": [
     {{
         "summary": "Risk Management Operational Scope",
-        "explanation": "The Risk Management Department at Enron plays a pivotal role in identifying, assessing, and mitigating financial risks. Their proactive approach, highlighted from the beginning of 2000, helps safeguard Enron against potential financial pitfalls and ensures continuous compliance with evolving market regulations. Effective risk management not only prevents financial anomalies but also supports the company's strategic decision-making processes.\n\n[Data: Sources (2, 3), Date_Range ((2000, 01, 01), (2000, 07, 12))]"
+        "explanation": "The Risk Management Department at Enron plays a pivotal role in identifying, assessing, and mitigating financial risks. Their proactive approach, highlighted from the beginning of 2000, helps safeguard Enron against potential financial pitfalls and ensures continuous compliance with evolving market regulations. Effective risk management not only prevents financial anomalies but also supports the company's strategic decision-making processes.
+
+[Data: Sources (2, 3), Date_Range ((2000, 01, 01), (2000, 07, 12))]"
     }},
     {{
         "summary": "Legal Compliance and Governance",
-        "explanation": "The Legal Compliance Department ensures that all Enron's operations adhere to the legal standards set by regulatory bodies. Their focus on corporate governance and contract management, noted starting Q2 2000, is crucial in maintaining Enron's reputation and operational legality, especially in managing complex contracts and corporate agreements. Their efforts underscore the commitment to upholding high legal standards and ethical practices.\n\n[Data: Source (5), Date_Range ((2000, 04, 01), (2000, 07, 12))]"
+        "explanation": "The Legal Compliance Department ensures that all Enron's operations adhere to the legal standards set by regulatory bodies. Their focus on corporate governance and contract management, noted starting Q2 2000, is crucial in maintaining Enron's reputation and operational legality, especially in managing complex contracts and corporate agreements. Their efforts underscore the commitment to upholding high legal standards and ethical practices.
+
+[Data: Source (5), Date_Range ((2000, 04, 01), (2000, 07, 12))]"
     }},
     {{
         "summary": "Interdepartmental Collaboration for Compliance",
-        "explanation": "Collaboration between the Risk Management and Legal Compliance Departments, established in Q2 2000, ensures that risk mitigation strategies are legally sound and that compliance measures consider financial risks. This synergy is vital for holistic governance and has been instrumental in integrating risk management with legal compliance strategies at Enron. Enhanced interdepartmental cooperation during this period plays a crucial role in aligning the company's strategies with regulatory requirements.\n\n[Data: Sources (9), Date_Range ((2000, 04, 01), (2000, 07, 12))]"
+        "explanation": "Collaboration between the Risk Management and Legal Compliance Departments, established in Q2 2000, ensures that risk mitigation strategies are legally sound and that compliance measures consider financial risks. This synergy is vital for holistic governance and has been instrumental in integrating risk management with legal compliance strategies at Enron. Enhanced interdepartmental cooperation during this period plays a crucial role in aligning the company's strategies with regulatory requirements.
+
+[Data: Sources (9), Date_Range ((2000, 04, 01), (2000, 07, 12))]"
     }}
 ],
 "date_range": ["2000-01-01", "2000-07-12"]
@@ -92,4 +93,3 @@
 {input_text}
 
 Output:
-"""
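The swap from `\n\n` to literal blank lines in the example explanations is required by the rename, not cosmetic: inside a Python `"""..."""` literal the parser turned the escape into real newlines at import time, but a `.txt` file is read verbatim, so the two characters backslash-n would otherwise reach the model as-is. A quick illustration:

```python
# In the old .py module, Python's parser turned this escape into real newlines:
py_literal = "strategic decision-making processes.\n\n[Data: Sources (2, 3)]"

# A raw .txt file is read verbatim; if it contained the characters '\' and 'n',
# the model would see them literally (simulated here with an escaped backslash):
txt_verbatim = "strategic decision-making processes.\\n\\n[Data: Sources (2, 3)]"

print(py_literal)    # blank line between sentence and citation
print(txt_verbatim)  # literal \n\n, hence the real blank lines in the renamed files
```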
diff --git a/graphrag/prompts/drift_reduce_prompt.txt b/graphrag/prompts/drift_reduce_prompt.txt
new file mode 100644
index 0000000000..88c9cfa17b
--- /dev/null
+++ b/graphrag/prompts/drift_reduce_prompt.txt
@@ -0,0 +1,60 @@
+
+---Role---
+
+You are a helpful assistant responding to questions about data in the reports provided.
+
+---Goal---
+
+Generate a response of the target length and format that responds to the user's question, summarizing all information in the input reports appropriate for the response length and format, and incorporating any relevant general knowledge while being as specific, accurate and concise as possible.
+
+If you don't know the answer, just say so. Do not make anything up.
+
+Points supported by data should list their data references as follows:
+
+"This is an example sentence supported by multiple data references [Data: (record ids); (record ids)]."
+
+Do not list more than 5 record ids in a single reference. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more.
+
+For example:
+
+"Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Sources (1, 5, 15)]."
+
+Do not include information where the supporting evidence for it is not provided.
+
+If you decide to use general knowledge, you should add a delimiter stating that the information is not supported by the data tables. For example:
+
+"Person X is the owner of Company Y and subject to many allegations of wrongdoing. [Data: General Knowledge (href)]"
+
+---Data Reports---
+
+{context_data}
+
+---Target response length and format---
+
+{response_type}
+
+
+---Goal---
+
+Generate a response of the target length and format that responds to the user's question, summarizing all information in the input reports appropriate for the response length and format, and incorporating any relevant general knowledge while being as specific, accurate and concise as possible.
+
+If you don't know the answer, just say so. Do not make anything up.
+
+Points supported by data should list their data references as follows:
+
+"This is an example sentence supported by multiple data references [Data: (record ids); (record ids)]."
+
+Do not list more than 5 record ids in a single reference. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more.
+
+For example:
+
+"Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Sources (1, 5, 15)]."
+
+Do not include information where the supporting evidence for it is not provided.
+
+If you decide to use general knowledge, you should add a delimiter stating that the information is not supported by the data tables. For example:
+
+"Person X is the owner of Company Y and subject to many allegations of wrongdoing. [Data: General Knowledge (href)]".
+
+Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown. Now answer the following query using the data above:
+
diff --git a/graphrag/prompts/drift_search_system_prompt.txt b/graphrag/prompts/drift_search_system_prompt.txt
new file mode 100644
index 0000000000..eb3d6544e7
--- /dev/null
+++ b/graphrag/prompts/drift_search_system_prompt.txt
@@ -0,0 +1,67 @@
+
+---Role---
+
+You are a helpful assistant responding to questions about data in the tables provided.
+
+
+---Goal---
+
+Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge.
+
+If you don't know the answer, just say so. Do not make anything up.
+
+Points supported by data should list their data references as follows:
+
+"This is an example sentence supported by multiple data references [Data: (record ids); (record ids)]."
+
+Do not list more than 5 record ids in a single reference. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more.
+
+For example:
+
+"Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Sources (15, 16)]."
+
+where 15, 16, 1, 5, 7, 23, 2, 7, 34, 46, and 64 represent the id (not the index) of the relevant data record.
+
+Pay close attention specifically to the Sources tables, as they contain the most relevant information for the user query. You will be rewarded for preserving the context of the sources in your response.
+
+---Target response length and format---
+
+{response_type}
+
+
+---Data tables---
+
+{context_data}
+
+
+---Goal---
+
+Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge.
+
+If you don't know the answer, just say so. Do not make anything up.
+
+Points supported by data should list their data references as follows:
+
+"This is an example sentence supported by multiple data references [Data: (record ids); (record ids)]."
+
+Do not list more than 5 record ids in a single reference. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more.
+
+For example:
+
+"Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Sources (15, 16)]."
+
+where 15, 16, 1, 5, 7, 23, 2, 7, 34, 46, and 64 represent the id (not the index) of the relevant data record.
+
+Pay close attention specifically to the Sources tables, as they contain the most relevant information for the user query. You will be rewarded for preserving the context of the sources in your response.
+
+---Target response length and format---
+
+{response_type}
+
+Add sections and commentary to the response as appropriate for the length and format.
+
+Additionally provide a score between 0 and 100 representing how well the response addresses the overall research question: {global_query}. Based on your response, suggest up to five follow-up questions that could be asked to further explore the topic as it relates to the overall research question. Do not include scores or follow up questions in the 'response' field of the JSON, add them to the respective 'score' and 'follow_up_queries' keys of the JSON output. Format your response in JSON with the following keys and values:
+
+{{'response': str, Put your answer, formatted in markdown, here. Do not answer the global query in this section.
+'score': int,
+'follow_up_queries': List[str]}}
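The doubled braces in the new templates suggest they are still filled via `str.format` (the diff does not show the call site, so treat this as an assumption). Under that assumption, `{{` and `}}` survive formatting as literal braces, which is what keeps the JSON schema block intact while `{response_type}` and friends are substituted:

```python
template = (
    "---Target response length and format---\n\n{response_type}\n\n"
    "Format your response in JSON with the following keys and values:\n\n"
    "{{'response': str,\n'score': int,\n'follow_up_queries': List[str]}}"
)

# Doubled braces survive .format() as literal braces, so the schema block
# reaches the model intact while the placeholder is substituted.
print(template.format(response_type="multi-paragraph report"))
```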
diff --git a/graphrag/prompts/index/extract_claims.py b/graphrag/prompts/extract_claims.txt
similarity index 91%
rename from graphrag/prompts/index/extract_claims.py
rename to graphrag/prompts/extract_claims.txt
index 5e0e5570c6..0b795c3465 100644
--- a/graphrag/prompts/index/extract_claims.py
+++ b/graphrag/prompts/extract_claims.txt
@@ -1,9 +1,4 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-"""A file containing prompts definition."""
-
-EXTRACT_CLAIMS_PROMPT = """
 -Target activity-
 You are an intelligent assistant that helps a human analyst to analyze claims against certain entities presented in a text document.
@@ -54,8 +49,4 @@
 Entity specification: {entity_specs}
 Claim description: {claim_description}
 Text: {input_text}
-Output:"""
-
-
-CONTINUE_PROMPT = "MANY entities were missed in the last extraction. Add them below using the same format:\n"
-LOOP_PROMPT = "It appears some entities may have still been missed. Answer Y if there are still entities that need to be added, or N if there are none. Please answer with a single letter Y or N.\n"
+Output:
\ No newline at end of file
diff --git a/graphrag/prompts/index/extract_graph.py b/graphrag/prompts/extract_graph.txt
similarity index 93%
rename from graphrag/prompts/index/extract_graph.py
rename to graphrag/prompts/extract_graph.txt
index a94b36142e..c0edeba3e4 100644
--- a/graphrag/prompts/index/extract_graph.py
+++ b/graphrag/prompts/extract_graph.txt
@@ -1,9 +1,4 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-"""A file containing prompts definition."""
-
-GRAPH_EXTRACTION_PROMPT = """
 -Goal-
 Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.
@@ -123,7 +118,4 @@
 Entity_types: {entity_types}
 Text: {input_text}
 ######################
-Output:"""
-
-CONTINUE_PROMPT = "MANY entities and relationships were missed in the last extraction. Remember to ONLY emit entities that match any of the previously extracted types. Add them below using the same format:\n"
-LOOP_PROMPT = "It appears some entities and relationships may have still been missed. Answer Y if there are still entities or relationships that need to be added, or N if there are none. Please answer with a single letter Y or N.\n"
+Output:
\ No newline at end of file
diff --git a/graphrag/prompts/query/global_search_knowledge_system_prompt.py b/graphrag/prompts/global_search_knowledge_system_prompt.txt
similarity index 61%
rename from graphrag/prompts/query/global_search_knowledge_system_prompt.py
rename to graphrag/prompts/global_search_knowledge_system_prompt.txt
index 9125ef31f2..e69a829f4d 100644
--- a/graphrag/prompts/query/global_search_knowledge_system_prompt.py
+++ b/graphrag/prompts/global_search_knowledge_system_prompt.txt
@@ -1,9 +1,3 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-"""Global Search system prompts."""
-
-GENERAL_KNOWLEDGE_INSTRUCTION = """
 The response may also include relevant real-world knowledge outside the dataset, but it must be explicitly annotated with a verification tag [LLM: verify]. For example:
 "This is an example sentence supported by real-world knowledge [LLM: verify]."
-"""
diff --git a/graphrag/prompts/query/global_search_map_system_prompt.py b/graphrag/prompts/global_search_map_system_prompt.txt
similarity index 96%
rename from graphrag/prompts/query/global_search_map_system_prompt.py
rename to graphrag/prompts/global_search_map_system_prompt.txt
index 02e98f9daa..5fc61f6093 100644
--- a/graphrag/prompts/query/global_search_map_system_prompt.py
+++ b/graphrag/prompts/global_search_map_system_prompt.txt
@@ -1,9 +1,4 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-"""System prompts for global search."""
-
-MAP_SYSTEM_PROMPT = """
 ---Role---
 
 You are a helpful assistant responding to questions about data in the tables provided.
@@ -82,4 +77,3 @@
 {{"description": "Description of point 2 [Data: Reports (report ids)]", "score": score_value}}
     ]
 }}
-"""
diff --git a/graphrag/prompts/query/global_search_reduce_system_prompt.py b/graphrag/prompts/global_search_reduce_system_prompt.txt
similarity index 93%
rename from graphrag/prompts/query/global_search_reduce_system_prompt.py
rename to graphrag/prompts/global_search_reduce_system_prompt.txt
index 01bf455237..4718ee90ef 100644
--- a/graphrag/prompts/query/global_search_reduce_system_prompt.py
+++ b/graphrag/prompts/global_search_reduce_system_prompt.txt
@@ -1,9 +1,4 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-"""Global Search system prompts."""
-
-REDUCE_SYSTEM_PROMPT = """
 ---Role---
 
 You are a helpful assistant responding to questions about a dataset by synthesizing perspectives from multiple analysts.
@@ -78,8 +73,3 @@
 {response_type}
 
 Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown.
-"""
-
-NO_DATA_ANSWER = (
-    "I am sorry but I am unable to answer this question given the provided data."
-)
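These renames drop more than docstrings: `CONTINUE_PROMPT`, `LOOP_PROMPT`, and `NO_DATA_ANSWER` disappear with the deleted modules. Unlike the templates, these are short control strings rather than files, so presumably they move into Python code; the diff does not show where they land, so the following module is a hypothetical home for them, with the strings copied verbatim from the deleted lines:

```python
# Hypothetical module retaining the control strings dropped by the renames;
# the diff itself does not show their new location.
CONTINUE_PROMPT = (
    "MANY entities and relationships were missed in the last extraction. "
    "Remember to ONLY emit entities that match any of the previously extracted "
    "types. Add them below using the same format:\n"
)
LOOP_PROMPT = (
    "It appears some entities and relationships may have still been missed. "
    "Answer Y if there are still entities or relationships that need to be "
    "added, or N if there are none. Please answer with a single letter Y or N.\n"
)
NO_DATA_ANSWER = (
    "I am sorry but I am unable to answer this question given the provided data."
)
```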
diff --git a/graphrag/prompts/index/__init__.py b/graphrag/prompts/index/__init__.py
deleted file mode 100644
index f7216c03af..0000000000
--- a/graphrag/prompts/index/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""All prompts for the indexing engine."""
diff --git a/graphrag/prompts/query/local_search_system_prompt.py b/graphrag/prompts/local_search_system_prompt.txt
similarity index 94%
rename from graphrag/prompts/query/local_search_system_prompt.py
rename to graphrag/prompts/local_search_system_prompt.txt
index 70b1d12fc3..0c20dd8ad6 100644
--- a/graphrag/prompts/query/local_search_system_prompt.py
+++ b/graphrag/prompts/local_search_system_prompt.txt
@@ -1,9 +1,4 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-"""Local search system prompts."""
-
-LOCAL_SEARCH_SYSTEM_PROMPT = """
 ---Role---
 
 You are a helpful assistant responding to questions about data in the tables provided.
@@ -66,4 +61,3 @@
 {response_type}
 
 Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown.
-"""
diff --git a/graphrag/prompts/query/__init__.py b/graphrag/prompts/query/__init__.py
deleted file mode 100644
index 8e8ef1e872..0000000000
--- a/graphrag/prompts/query/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""All prompts for the query engine."""
diff --git a/graphrag/prompts/query/drift_search_system_prompt.py b/graphrag/prompts/query/drift_search_system_prompt.py
deleted file mode 100644
index 3faae89a0e..0000000000
--- a/graphrag/prompts/query/drift_search_system_prompt.py
+++ /dev/null
@@ -1,167 +0,0 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-
-"""DRIFT Search prompts."""
-
-DRIFT_LOCAL_SYSTEM_PROMPT = """
----Role---
-
-You are a helpful assistant responding to questions about data in the tables provided.
-
-
----Goal---
-
-Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge.
-
-If you don't know the answer, just say so. Do not make anything up.
-
-Points supported by data should list their data references as follows:
-
-"This is an example sentence supported by multiple data references [Data: (record ids); (record ids)]."
-
-Do not list more than 5 record ids in a single reference. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more.
-
-For example:
-
-"Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Sources (15, 16)]."
-
-where 15, 16, 1, 5, 7, 23, 2, 7, 34, 46, and 64 represent the id (not the index) of the relevant data record.
-
-Pay close attention specifically to the Sources tables as it contains the most relevant information for the user query. You will be rewarded for preserving the context of the sources in your response.
-
----Target response length and format---
-
-{response_type}
-
-
----Data tables---
-
-{context_data}
-
-
----Goal---
-
-Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge.
-
-If you don't know the answer, just say so. Do not make anything up.
-
-Points supported by data should list their data references as follows:
-
-"This is an example sentence supported by multiple data references [Data: (record ids); (record ids)]."
-
-Do not list more than 5 record ids in a single reference. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more.
-
-For example:
-
-"Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Sources (15, 16)]."
-
-where 15, 16, 1, 5, 7, 23, 2, 7, 34, 46, and 64 represent the id (not the index) of the relevant data record.
-
-Pay close attention specifically to the Sources tables as it contains the most relevant information for the user query. You will be rewarded for preserving the context of the sources in your response.
-
----Target response length and format---
-
-{response_type}
-
-Add sections and commentary to the response as appropriate for the length and format.
-
-Additionally provide a score between 0 and 100 representing how well the response addresses the overall research question: {global_query}. Based on your response, suggest up to five follow-up questions that could be asked to further explore the topic as it relates to the overall research question. Do not include scores or follow up questions in the 'response' field of the JSON, add them to the respective 'score' and 'follow_up_queries' keys of the JSON output. Format your response in JSON with the following keys and values:
-
-{{'response': str, Put your answer, formatted in markdown, here. Do not answer the global query in this section.
-'score': int,
-'follow_up_queries': List[str]}}
-"""
-
-
-DRIFT_REDUCE_PROMPT = """
----Role---
-
-You are a helpful assistant responding to questions about data in the reports provided.
-
----Goal---
-
-Generate a response of the target length and format that responds to the user's question, summarizing all information in the input reports appropriate for the response length and format, and incorporating any relevant general knowledge while being as specific, accurate and concise as possible.
-
-If you don't know the answer, just say so. Do not make anything up.
-
-Points supported by data should list their data references as follows:
-
-"This is an example sentence supported by multiple data references [Data: (record ids); (record ids)]."
-
-Do not list more than 5 record ids in a single reference. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more.
-
-For example:
-
-"Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Sources (1, 5, 15)]."
-
-Do not include information where the supporting evidence for it is not provided.
-
-If you decide to use general knowledge, you should add a delimiter stating that the information is not supported by the data tables. For example:
-
-"Person X is the owner of Company Y and subject to many allegations of wrongdoing. [Data: General Knowledge (href)]"
-
----Data Reports---
-
-{context_data}
-
----Target response length and format---
-
-{response_type}
-
-
----Goal---
-
-Generate a response of the target length and format that responds to the user's question, summarizing all information in the input reports appropriate for the response length and format, and incorporating any relevant general knowledge while being as specific, accurate and concise as possible.
-
-If you don't know the answer, just say so. Do not make anything up.
-
-Points supported by data should list their data references as follows:
-
-"This is an example sentence supported by multiple data references [Data: (record ids); (record ids)]."
-
-Do not list more than 5 record ids in a single reference. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more.
-
-For example:
-
-"Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Sources (1, 5, 15)]."
-
-Do not include information where the supporting evidence for it is not provided.
-
-If you decide to use general knowledge, you should add a delimiter stating that the information is not supported by the data tables. For example:
-
-"Person X is the owner of Company Y and subject to many allegations of wrongdoing. [Data: General Knowledge (href)]".
-
-Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown. Now answer the following query using the data above:
-
-"""
-
-
-DRIFT_PRIMER_PROMPT = """You are a helpful agent designed to reason over a knowledge graph in response to a user query.
-This is a unique knowledge graph where edges are freeform text rather than verb operators. You will begin your reasoning looking at a summary of the content of the most relevant communites and will provide:
-
-1. score: How well the intermediate answer addresses the query. A score of 0 indicates a poor, unfocused answer, while a score of 100 indicates a highly focused, relevant answer that addresses the query in its entirety.
-
-2. intermediate_answer: This answer should match the level of detail and length found in the community summaries. The intermediate answer should be exactly 2000 characters long. This must be formatted in markdown and must begin with a header that explains how the following text is related to the query.
-
-3. follow_up_queries: A list of follow-up queries that could be asked to further explore the topic. These should be formatted as a list of strings. Generate at least five good follow-up queries.
-
-Use this information to help you decide whether or not you need more information about the entities mentioned in the report. You may also use your general knowledge to think of entities which may help enrich your answer.
-
-You will also provide a full answer from the content you have available. Use the data provided to generate follow-up queries to help refine your search. Do not ask compound questions, for example: "What is the market cap of Apple and Microsoft?". Use your knowledge of the entity distribution to focus on entity types that will be useful for searching a broad area of the knowledge graph.
-
-For the query:
-
-{query}
-
-The top-ranked community summaries:
-
-{community_reports}
-
-Provide the intermediate answer, and all scores in JSON format following:
-
-{{'intermediate_answer': str,
-'score': int,
-'follow_up_queries': List[str]}}
-
-Begin:
-"""
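With the prompt constants gone, a typo in a filename would now surface at runtime rather than at import time. A small illustrative check (not part of the PR; the filenames are taken from the renames above, the test itself is hypothetical) could guard against that:

```python
from pathlib import Path

import pytest

# Assumed location of the flattened prompt files introduced by this PR.
PROMPT_DIR = Path("graphrag/prompts")

@pytest.mark.parametrize("name", [
    "basic_search_system_prompt.txt",
    "drift_search_system_prompt.txt",
    "drift_reduce_prompt.txt",
    "local_search_system_prompt.txt",
])
def test_prompt_files_exist(name):
    assert (PROMPT_DIR / name).is_file()
```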
diff --git a/graphrag/prompts/query/question_gen_system_prompt.py b/graphrag/prompts/question_gen_system_prompt.txt
similarity index 83%
rename from graphrag/prompts/query/question_gen_system_prompt.py
rename to graphrag/prompts/question_gen_system_prompt.txt
index 904ede2435..f87287d5f5 100644
--- a/graphrag/prompts/query/question_gen_system_prompt.py
+++ b/graphrag/prompts/question_gen_system_prompt.txt
@@ -1,9 +1,4 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-"""Question Generation system prompts."""
-
-QUESTION_SYSTEM_PROMPT = """
 ---Role---
 
 You are a helpful assistant generating a bulleted list of {question_count} questions about data in the tables provided.
@@ -25,4 +20,3 @@
 If the user's questions reference several named entities, then each candidate question should reference all named entities.
 
 ---Example questions---
-"""
diff --git a/graphrag/prompts/index/summarize_descriptions.py b/graphrag/prompts/summarize_descriptions.txt
similarity index 83%
rename from graphrag/prompts/index/summarize_descriptions.py
rename to graphrag/prompts/summarize_descriptions.txt
index 4a916195bf..ab0262fedb 100644
--- a/graphrag/prompts/index/summarize_descriptions.py
+++ b/graphrag/prompts/summarize_descriptions.txt
@@ -1,9 +1,4 @@
-# Copyright (c) 2024 Microsoft Corporation.
-# Licensed under the MIT License
-"""A file containing prompts definition."""
-
-SUMMARIZE_PROMPT = """
 You are a helpful assistant responsible for generating a comprehensive summary of the data provided below.
 Given one or more entities, and a list of descriptions, all related to the same entity or group of entities.
 Please concatenate all of these into a single, comprehensive description. Make sure to include information collected from all the descriptions.
@@ -17,4 +12,3 @@
 Description List: {description_list}
 #######
 Output:
-"""
diff --git a/tests/unit/indexing/graph/test_small_corpus_bug.py b/tests/unit/indexing/graph/test_small_corpus_bug.py
new file mode 100644
index 0000000000..4f76d068b8
--- /dev/null
+++ b/tests/unit/indexing/graph/test_small_corpus_bug.py
@@ -0,0 +1,17 @@
+import pandas as pd
+from graphrag.index.operations.build_noun_graph.build_noun_graph import _extract_edges
+
+def test_edges_expand_with_singletons():
+
+    df = pd.DataFrame(
+        {
+            "title": ["foo", "bar"],
+            "frequency": [1, 1],
+            "text_unit_ids": [[1], [1]],
+        }
+    )
+    edges = _extract_edges(df, normalize_edge_weights=False)
+
+    assert len(edges) == 1
+    assert set(edges.columns) == {"source", "target", "weight", "text_unit_ids"}
+    assert {"foo", "bar"} == set(edges.loc[0, ["source", "target"]])
\ No newline at end of file
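The new regression test can be run in isolation with `pytest tests/unit/indexing/graph/test_small_corpus_bug.py`. It exercises the small-corpus path fixed in the first hunk: two nouns sharing a single text unit yield exactly one edge, a shape that the old list-assignment in `_extract_edges` appears to have mishandled, and it asserts both the surviving column set and the canonical source/target ordering.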