14 changes: 10 additions & 4 deletions graphrag/index/operations/build_noun_graph/build_noun_graph.py
@@ -108,15 +108,21 @@ def _extract_edges(
text_units_df = text_units_df.assign(edges=all_edges) # type: ignore
edge_df = text_units_df.explode("edges")[["edges", "text_unit_id"]]

edge_df[["source", "target"]] = edge_df.loc[:, "edges"].to_list()
# convert tuples/lists into a two-column DataFrame, padding with NaNs as needed
edges_list = edge_df["edges"].apply(
lambda tup: list(tup) if isinstance(tup, (list, tuple)) else [np.nan, np.nan]
)
edges_expanded = pd.DataFrame(edges_list.tolist(), index=edge_df.index, columns=["source", "target"]).iloc[:, :2]
edge_df = pd.concat([edge_df.drop(columns=["edges"]), edges_expanded], axis=1)
edge_df = edge_df.dropna(subset=["source", "target"]) # drop any rows with invalid source & target

edge_df["min_source"] = edge_df[["source", "target"]].min(axis=1)
edge_df["max_target"] = edge_df[["source", "target"]].max(axis=1)
edge_df = edge_df.drop(columns=["source", "target"]).rename(
columns={"min_source": "source", "max_target": "target"} # type: ignore
)

edge_df = edge_df[(edge_df.source.notna()) & (edge_df.target.notna())]
edge_df = edge_df.drop(columns=["edges"])
# edge_df = edge_df[(edge_df.source.notna()) & (edge_df.target.notna())]
# edge_df = edge_df.drop(columns=["edges"])
# group by source and target, count the number of text units
grouped_edge_df = (
edge_df.groupby(["source", "target"]).agg({"text_unit_id": list}).reset_index()
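For reference, a minimal self-contained sketch of how the padded expansion above behaves (the toy edges and text-unit IDs are illustrative, not project data): a malformed "edges" entry becomes a NaN pair instead of breaking the two-column assignment, and is dropped afterwards.

import numpy as np
import pandas as pd

edge_df = pd.DataFrame(
    {
        "edges": [("alpha", "beta"), None, ("beta", "gamma")],
        "text_unit_id": ["t1", "t2", "t3"],
    }
)

# Pad anything that is not a list/tuple so every row yields exactly two values.
edges_list = edge_df["edges"].apply(
    lambda tup: list(tup) if isinstance(tup, (list, tuple)) else [np.nan, np.nan]
)
edges_expanded = pd.DataFrame(
    edges_list.tolist(), index=edge_df.index, columns=["source", "target"]
)
edge_df = pd.concat([edge_df.drop(columns=["edges"]), edges_expanded], axis=1)
edge_df = edge_df.dropna(subset=["source", "target"])  # drops the None row

print(edge_df)
#   text_unit_id source target
# 0           t1  alpha   beta
# 2           t3   beta  gamma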
4 changes: 4 additions & 0 deletions graphrag/index/operations/graph_to_dataframes.py
@@ -35,4 +35,8 @@ def graph_to_dataframes(
if edge_columns:
edges = edges.loc[:, edge_columns]

# ndarray embeddings to be stored as list objects to avoid broadcasting errors
if "embedding" in nodes.columns and isinstance(nodes["embedding"].iloc[0], (list, tuple)) is False:
nodes["embedding"] = nodes["embedding"].apply(lambda x: x.tolist() if hasattr(x, "tolist") else x)

return (nodes, edges)
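A small illustrative sketch of the embedding guard above (toy titles and vectors, assuming pandas and numpy): ndarray cells are converted to plain lists, so each row holds one list object rather than an array that pandas may try to broadcast.

import numpy as np
import pandas as pd

nodes = pd.DataFrame({"title": ["a", "b"]})
nodes["embedding"] = [np.array([0.1, 0.2]), np.array([0.3, 0.4])]

# Mirror of the added check: convert only when the cells are not already lists/tuples.
if "embedding" in nodes.columns and not isinstance(nodes["embedding"].iloc[0], (list, tuple)):
    nodes["embedding"] = nodes["embedding"].apply(
        lambda x: x.tolist() if hasattr(x, "tolist") else x
    )

print(type(nodes["embedding"].iloc[0]))  # <class 'list'>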
4 changes: 4 additions & 0 deletions graphrag/index/operations/prune_graph.py
@@ -29,6 +29,10 @@ def prune_graph(
# remove ego nodes if needed
degree = cast("DegreeView", graph.degree)
degrees = list(degree()) # type: ignore
# empty graph with nothing to prune
if not degrees:
return graph

if remove_ego_nodes:
# ego node is one with highest degree
ego_node = max(degrees, key=lambda x: x[1])
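A standalone sketch of why the early return matters (the prune_ego helper below is hypothetical, not the project function): on an empty graph, graph.degree yields nothing, so max() over the degree list would raise ValueError; returning the graph unchanged sidesteps that.

import networkx as nx

def prune_ego(graph: nx.Graph) -> nx.Graph:
    degrees = list(graph.degree)
    if not degrees:  # empty graph: max() below would raise ValueError
        return graph
    ego_node, _ = max(degrees, key=lambda x: x[1])  # highest-degree node
    graph.remove_node(ego_node)
    return graph

print(prune_ego(nx.Graph()))        # Graph with 0 nodes and 0 edges
print(prune_ego(nx.path_graph(3)))  # middle node removed: 2 nodes, 0 edges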
6 changes: 5 additions & 1 deletion graphrag/index/workflows/prune_graph.py
@@ -69,8 +69,12 @@ def prune_graph(

# subset the full nodes and edges to only include the pruned remainders
subset_entities = pruned_nodes.merge(entities, on="title", how="inner")
pruned_edges["source"] = pruned_edges["source"].astype(str)
pruned_edges["target"] = pruned_edges["target"].astype(str)
entities["title"] = entities["title"].astype(str)

subset_relationships = pruned_edges.merge(
relationships, on=["source", "target"], how="inner"
entities, how="inner", left_on="source", right_on="title"
)

return (subset_entities, subset_relationships)
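An illustrative sketch of the dtype alignment above (invented frames, not the workflow tables): casting both join keys to str keeps the inner merge from silently coming back empty when one side holds numeric-looking values.

import pandas as pd

pruned_edges = pd.DataFrame({"source": [1, 2], "target": [2, 3]})
entities = pd.DataFrame({"title": ["1", "3"], "degree": [5, 2]})

# Without these casts, int sources never match str titles and the merge is empty.
pruned_edges["source"] = pruned_edges["source"].astype(str)
pruned_edges["target"] = pruned_edges["target"].astype(str)
entities["title"] = entities["title"].astype(str)

subset = pruned_edges.merge(entities, how="inner", left_on="source", right_on="title")
print(subset)  # one matching row: source "1" joins title "1"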
4 changes: 0 additions & 4 deletions graphrag/prompts/__init__.py

This file was deleted.

@@ -1,9 +1,4 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""Basic Search prompts."""

BASIC_SEARCH_SYSTEM_PROMPT = """
---Role---

You are a helpful assistant responding to questions about data in the tables provided.
@@ -70,4 +65,3 @@
{response_type}

Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown.
"""
@@ -1,8 +1,4 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License
"""A file containing prompts definition."""

COMMUNITY_REPORT_PROMPT = """
You are an AI assistant that helps a human analyst to perform general information discovery. Information discovery is the process of identifying and assessing relevant information associated with certain entities (e.g., organizations and individuals) within a network.

# Goal
@@ -150,4 +146,4 @@

Limit the total report length to {max_report_length} words.

Output:"""
Output:
@@ -1,9 +1,4 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""A file containing prompts definition."""

COMMUNITY_REPORT_TEXT_PROMPT = """
You are an AI assistant that helps a human analyst to perform general information discovery.
Information discovery is the process of identifying and assessing relevant information associated with certain entities (e.g., organizations and individuals) within a network.

@@ -69,15 +64,21 @@
"findings": [
{{
"summary": "Risk Management Operational Scope",
"explanation": "The Risk Management Department at Enron plays a pivotal role in identifying, assessing, and mitigating financial risks. Their proactive approach, highlighted from the beginning of 2000, helps safeguard Enron against potential financial pitfalls and ensures continuous compliance with evolving market regulations. Effective risk management not only prevents financial anomalies but also supports the company's strategic decision-making processes.\n\n[Data: Sources (2, 3), Date_Range ((2000, 01, 01), (2000, 07, 12))]"
"explanation": "The Risk Management Department at Enron plays a pivotal role in identifying, assessing, and mitigating financial risks. Their proactive approach, highlighted from the beginning of 2000, helps safeguard Enron against potential financial pitfalls and ensures continuous compliance with evolving market regulations. Effective risk management not only prevents financial anomalies but also supports the company's strategic decision-making processes.

[Data: Sources (2, 3), Date_Range ((2000, 01, 01), (2000, 07, 12))]"
}},
{{
"summary": "Legal Compliance and Governance",
"explanation": "The Legal Compliance Department ensures that all Enron's operations adhere to the legal standards set by regulatory bodies. Their focus on corporate governance and contract management, noted starting Q2 2000, is crucial in maintaining Enron's reputation and operational legality, especially in managing complex contracts and corporate agreements. Their efforts underscore the commitment to upholding high legal standards and ethical practices.\n\n[Data: Source (5), Date_Range ((2000, 04, 01), (2000, 07, 12))]"
"explanation": "The Legal Compliance Department ensures that all Enron's operations adhere to the legal standards set by regulatory bodies. Their focus on corporate governance and contract management, noted starting Q2 2000, is crucial in maintaining Enron's reputation and operational legality, especially in managing complex contracts and corporate agreements. Their efforts underscore the commitment to upholding high legal standards and ethical practices.

[Data: Source (5), Date_Range ((2000, 04, 01), (2000, 07, 12))]"
}},
{{
"summary": "Interdepartmental Collaboration for Compliance",
"explanation": "Collaboration between the Risk Management and Legal Compliance Departments, established in Q2 2000, ensures that risk mitigation strategies are legally sound and that compliance measures consider financial risks. This synergy is vital for holistic governance and has been instrumental in integrating risk management with legal compliance strategies at Enron. Enhanced interdepartmental cooperation during this period plays a crucial role in aligning the company's strategies with regulatory requirements.\n\n[Data: Sources (9), Date_Range ((2000, 04, 01), (2000, 07, 12))]"
"explanation": "Collaboration between the Risk Management and Legal Compliance Departments, established in Q2 2000, ensures that risk mitigation strategies are legally sound and that compliance measures consider financial risks. This synergy is vital for holistic governance and has been instrumental in integrating risk management with legal compliance strategies at Enron. Enhanced interdepartmental cooperation during this period plays a crucial role in aligning the company's strategies with regulatory requirements.

[Data: Sources (9), Date_Range ((2000, 04, 01), (2000, 07, 12))]"
}}
],
"date_range": ["2000-01-01", "2000-07-12"]
@@ -92,4 +93,3 @@
{input_text}

Output:
"""
60 changes: 60 additions & 0 deletions graphrag/prompts/drift_reduce_prompt.txt
@@ -0,0 +1,60 @@

---Role---

You are a helpful assistant responding to questions about data in the reports provided.

---Goal---

Generate a response of the target length and format that responds to the user's question, summarizing all information in the input reports appropriate for the response length and format, and incorporating any relevant general knowledge while being as specific, accurate and concise as possible.

If you don't know the answer, just say so. Do not make anything up.

Points supported by data should list their data references as follows:

"This is an example sentence supported by multiple data references [Data: <dataset name> (record ids); <dataset name> (record ids)]."

Do not list more than 5 record ids in a single reference. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more.

For example:

"Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Sources (1, 5, 15)]."

Do not include information where the supporting evidence for it is not provided.

If you decide to use general knowledge, you should add a delimiter stating that the information is not supported by the data tables. For example:

"Person X is the owner of Company Y and subject to many allegations of wrongdoing. [Data: General Knowledge (href)]"

---Data Reports---

{context_data}

---Target response length and format---

{response_type}


---Goal---

Generate a response of the target length and format that responds to the user's question, summarizing all information in the input reports appropriate for the response length and format, and incorporating any relevant general knowledge while being as specific, accurate and concise as possible.

If you don't know the answer, just say so. Do not make anything up.

Points supported by data should list their data references as follows:

"This is an example sentence supported by multiple data references [Data: <dataset name> (record ids); <dataset name> (record ids)]."

Do not list more than 5 record ids in a single reference. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more.

For example:

"Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Sources (1, 5, 15)]."

Do not include information where the supporting evidence for it is not provided.

If you decide to use general knowledge, you should add a delimiter stating that the information is not supported by the data tables. For example:

"Person X is the owner of Company Y and subject to many allegations of wrongdoing. [Data: General Knowledge (href)]".

Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown. Now answer the following query using the data above:

67 changes: 67 additions & 0 deletions graphrag/prompts/drift_search_system_prompt.txt
@@ -0,0 +1,67 @@

---Role---

You are a helpful assistant responding to questions about data in the tables provided.


---Goal---

Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge.

If you don't know the answer, just say so. Do not make anything up.

Points supported by data should list their data references as follows:

"This is an example sentence supported by multiple data references [Data: <dataset name> (record ids); <dataset name> (record ids)]."

Do not list more than 5 record ids in a single reference. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more.

For example:

"Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Sources (15, 16)]."

where 15, 16, 1, 5, 7, 23, 2, 7, 34, 46, and 64 represent the id (not the index) of the relevant data record.

Pay close attention specifically to the Sources tables as it contains the most relevant information for the user query. You will be rewarded for preserving the context of the sources in your response.

---Target response length and format---

{response_type}


---Data tables---

{context_data}


---Goal---

Generate a response of the target length and format that responds to the user's question, summarizing all information in the input data tables appropriate for the response length and format, and incorporating any relevant general knowledge.

If you don't know the answer, just say so. Do not make anything up.

Points supported by data should list their data references as follows:

"This is an example sentence supported by multiple data references [Data: <dataset name> (record ids); <dataset name> (record ids)]."

Do not list more than 5 record ids in a single reference. Instead, list the top 5 most relevant record ids and add "+more" to indicate that there are more.

For example:

"Person X is the owner of Company Y and subject to many allegations of wrongdoing [Data: Sources (15, 16)]."

where 15, 16, 1, 5, 7, 23, 2, 7, 34, 46, and 64 represent the id (not the index) of the relevant data record.

Pay close attention specifically to the Sources tables as it contains the most relevant information for the user query. You will be rewarded for preserving the context of the sources in your response.

---Target response length and format---

{response_type}

Add sections and commentary to the response as appropriate for the length and format.

Additionally provide a score between 0 and 100 representing how well the response addresses the overall research question: {global_query}. Based on your response, suggest up to five follow-up questions that could be asked to further explore the topic as it relates to the overall research question. Do not include scores or follow up questions in the 'response' field of the JSON, add them to the respective 'score' and 'follow_up_queries' keys of the JSON output. Format your response in JSON with the following keys and values:

{{'response': str, Put your answer, formatted in markdown, here. Do not answer the global query in this section.
'score': int,
'follow_up_queries': List[str]}}
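Purely as an illustration of the output contract this prompt describes (all values invented), a model response shaped to those keys would look roughly like the following, written here as a Python literal:

example_output = {
    "response": "## Findings\n\nCompany Y is linked to several allegations [Data: Sources (15, 16)].",
    "score": 72,
    "follow_up_queries": [
        "What other entities are connected to Company Y?",
        "How did the allegations against Person X develop over time?",
    ],
}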
@@ -1,9 +1,4 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""A file containing prompts definition."""

EXTRACT_CLAIMS_PROMPT = """
-Target activity-
You are an intelligent assistant that helps a human analyst to analyze claims against certain entities presented in a text document.

@@ -54,8 +49,4 @@
Entity specification: {entity_specs}
Claim description: {claim_description}
Text: {input_text}
Output:"""


CONTINUE_PROMPT = "MANY entities were missed in the last extraction. Add them below using the same format:\n"
LOOP_PROMPT = "It appears some entities may have still been missed. Answer Y if there are still entities that need to be added, or N if there are none. Please answer with a single letter Y or N.\n"
Output:
@@ -1,9 +1,4 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""A file containing prompts definition."""

GRAPH_EXTRACTION_PROMPT = """
-Goal-
Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.

@@ -123,7 +118,4 @@
Entity_types: {entity_types}
Text: {input_text}
######################
Output:"""

CONTINUE_PROMPT = "MANY entities and relationships were missed in the last extraction. Remember to ONLY emit entities that match any of the previously extracted types. Add them below using the same format:\n"
LOOP_PROMPT = "It appears some entities and relationships may have still been missed. Answer Y if there are still entities or relationships that need to be added, or N if there are none. Please answer with a single letter Y or N.\n"
Output:
@@ -1,9 +1,3 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""Global Search system prompts."""

GENERAL_KNOWLEDGE_INSTRUCTION = """
The response may also include relevant real-world knowledge outside the dataset, but it must be explicitly annotated with a verification tag [LLM: verify]. For example:
"This is an example sentence supported by real-world knowledge [LLM: verify]."
"""
@@ -1,9 +1,4 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""System prompts for global search."""

MAP_SYSTEM_PROMPT = """
---Role---

You are a helpful assistant responding to questions about data in the tables provided.
@@ -82,4 +77,3 @@
{{"description": "Description of point 2 [Data: Reports (report ids)]", "score": score_value}}
]
}}
"""
@@ -1,9 +1,4 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""Global Search system prompts."""

REDUCE_SYSTEM_PROMPT = """
---Role---

You are a helpful assistant responding to questions about a dataset by synthesizing perspectives from multiple analysts.
@@ -78,8 +73,3 @@
{response_type}

Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown.
"""

NO_DATA_ANSWER = (
"I am sorry but I am unable to answer this question given the provided data."
)
4 changes: 0 additions & 4 deletions graphrag/prompts/index/__init__.py

This file was deleted.

@@ -1,9 +1,4 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""Local search system prompts."""

LOCAL_SEARCH_SYSTEM_PROMPT = """
---Role---

You are a helpful assistant responding to questions about data in the tables provided.
Expand Down Expand Up @@ -66,4 +61,3 @@
{response_type}

Add sections and commentary to the response as appropriate for the length and format. Style the response in markdown.
"""
4 changes: 0 additions & 4 deletions graphrag/prompts/query/__init__.py

This file was deleted.
