biocompute-objects
diff --git a/bcorag/bcorag.py b/bcorag/bcorag.py
Lines changed: 51 additions & 13 deletions
diff --git a/bcorag/custom_types/core_types.py b/bcorag/custom_types/core_types.py
Lines changed: 6 additions & 3 deletions
diff --git a/bcorag/prompts/__init__.py b/bcorag/prompts/__init__.py
Lines changed: 82 additions & 0 deletions
@@ -9,7 +9,12 @@
     get_response_synthesizer,
     Response,
 )
-from llama_index.core.callbacks import CallbackManager, TokenCountingHandler
+from llama_index.core.callbacks import (
+    CallbackManager,
+    TokenCountingHandler,
+)
+from llama_index.core.prompts import PromptTemplate
+from llama_index.core.schema import QueryBundle
 from llama_index.core.retrievers import VectorIndexRetriever
 from llama_index.core.query_engine import RetrieverQueryEngine
 from llama_index.llms.openai import OpenAI  # type: ignore
@@ -47,7 +52,13 @@
     default_output_tracker_file,
 )
 import bcorag.misc_functions as misc_fns
-from .prompts import DOMAIN_MAP, QUERY_PROMPT, SUPPLEMENT_PROMPT
+from .prompts import (
+    PROMPT_DOMAIN_MAP,
+    RETRIEVAL_PROMPT,
+    LLM_PROMPT,
+    SUPPLEMENT_PROMPT,
+    LLM_PROMPT_TEMPLATE,
+)
 
 # import llama_index.core
 # llama_index.core.set_global_handler("simple")
@@ -133,7 +144,7 @@ def __init__(
         load_dotenv()
 
         self._parameter_set_hash = self._user_selection_hash(user_selections)
-        self._domain_map = DOMAIN_MAP
+        self._domain_map = PROMPT_DOMAIN_MAP
         self._file_name = user_selections["filename"]
         self._file_path = user_selections["filepath"]
         self._output_path_root = os.path.join(
@@ -285,16 +296,24 @@ def __init__(
                     )
                     self._index = VectorStoreIndex(nodes=nodes)
 
-        retriever = VectorIndexRetriever(
-            index=self._index, similarity_top_k=self._similarity_top_k * 3
+        base_retriever = VectorIndexRetriever(
+            index=self._index,
+            similarity_top_k=self._similarity_top_k * 3,
+        )
+        # transform_retriever = TransformRetriever(
+        #     retriever=base_retriever,
+        #     query_transform=CustomQueryTransform(delimiter=DELIMITER),
+        # )
+        llm_prompt_template = PromptTemplate(template=LLM_PROMPT_TEMPLATE)
+        response_synthesizer = get_response_synthesizer(
+            text_qa_template=llm_prompt_template
         )
-        response_synthesizer = get_response_synthesizer()
         rerank_postprocessor = SentenceTransformerRerank(
             top_n=self._similarity_top_k,
             keep_retrieval_score=True,
         )
         self._query_engine = RetrieverQueryEngine(
-            retriever=retriever,
+            retriever=base_retriever,
             response_synthesizer=response_synthesizer,
             node_postprocessors=[rerank_postprocessor],
         )
@@ -322,16 +341,28 @@ def perform_query(self, domain: DomainKey) -> str:
             The generated domain.
         """
         query_start_time = time.time()
-        domain_prompt = self._domain_map[domain]["prompt"]
+        domain_retrieval_prompt = self._domain_map[domain]["retrieval_prompt"]
+        domain_llm_prompt = self._domain_map[domain]["llm_prompt"]
+
         for dependency in self._domain_map[domain]["dependencies"]:
             if self.domain_content[dependency] is not None:
                 dependency_prompt = f"The {domain} domain is dependent on the {dependency} domain. Here is the {dependency} domain: {self.domain_content[dependency]}."
-                domain_prompt += dependency_prompt
-        query_prompt = QUERY_PROMPT.format(domain, domain_prompt)
+                domain_llm_prompt += dependency_prompt
+
+        # full_prompt = f"{RETRIEVAL_PROMPT.format(domain, domain_retrieval_prompt)} {DELIMITER} {LLM_PROMPT.format(domain, domain_llm_prompt)}"
+        llm_prompt = f"{LLM_PROMPT.format(domain, domain_llm_prompt)}"
         if self._domain_map[domain]["top_level"]:
-            query_prompt += f"\n{SUPPLEMENT_PROMPT}"
+            llm_prompt += f"\n{SUPPLEMENT_PROMPT}"
+        query_bundle = QueryBundle(
+            query_str=llm_prompt,
+            custom_embedding_strs=[
+                f"{RETRIEVAL_PROMPT.format(domain, domain_retrieval_prompt)}"
+            ],
+            embedding=None,
+        )
+
+        response_object = self._query_engine.query(query_bundle)
 
-        response_object = self._query_engine.query(query_prompt)
         if isinstance(response_object, Response):
             response_object = Response(
                 response=response_object.response,
@@ -369,7 +400,14 @@ def perform_query(self, domain: DomainKey) -> str:
             source_str += "\n"
 
         if self._debug:
-            self._display_info(query_prompt, f"QUERY PROMPT for the {domain} domain:")
+            self._display_info(
+                query_bundle.query_str, f"LLM PROMPT for the {domain} domain:"
+            )
+            if query_bundle.custom_embedding_strs is not None:
+                self._display_info(
+                    query_bundle.custom_embedding_strs[0],
+                    f"RETRIEVAL PROMPT for the {domain} domain:",
+                )
             self._token_counts["input"] += self._token_counter.prompt_llm_token_count  # type: ignore
             self._token_counts["output"] += self._token_counter.completion_llm_token_count  # type: ignore
             self._token_counts["total"] += self._token_counter.total_llm_token_count  # type: ignore
 
@@ -350,8 +350,10 @@ class IndividualDomainMapEntry(TypedDict):
 
     Attributes
     ----------
-    prompt : str
-        The prompt to use for querying the RAG pipeline for a specific domain generation.
+    retrieval_prompt : str
+        The prompt to use for the RAG pipeline retrieval process.
+    llm_prompt : str
+        The prompt to use for the LLM.
     top_level : bool
         Whether the specified domain includes object's defined in the top level JSON schema.
     user_prompt : str
@@ -362,7 +364,8 @@ class IndividualDomainMapEntry(TypedDict):
         The domain dependencies.
     """
 
-    prompt: str
+    retrieval_prompt: str
+    llm_prompt: str
     top_level: bool
     user_prompt: str
     code: str
 
@@ -0,0 +1,82 @@
+from bcorag.custom_types.core_types import DomainMap
+from .retrieval import (
+    RETRIEVAL_PROMPT,
+    USABILITY_DOMAIN_RETRIEVAL,
+    IO_DOMAIN_RETRIEVAL,
+    DESCRIPTION_DOMAIN_RETRIEVAL,
+    EXECUTION_DOMAIN_RETRIEVAL,
+    PARAMETRIC_DOMAIN_RETRIEVAL,
+    ERROR_DOMAIN_RETRIEVAL,
+)
+from .llm_prompts import (
+    LLM_PROMPT,
+    USABILITY_DOMAIN_LLM,
+    IO_DOMAIN_LLM,
+    DESCRIPTION_DOMAIN_LLM,
+    EXECUTION_DOMAIN_LLM,
+    PARAMETRIC_DOMAIN_LLM,
+    ERROR_DOMAIN_LLM,
+    SUPPLEMENT_PROMPT,
+)
+
+LLM_PROMPT_TEMPLATE = """
+Below is some excerpts from a bioinformatics project. The information is from the project's publication and could also contain some information from the project's code repository.
+
+{context_str}
+
+---------\n
+
+{query_str}
+"""  # wrapped in PromptTemplate and passed as text_qa_template in bcorag.py; the "\n" after the dashes is a real escape (extra newline), not literal text
+
+
+PROMPT_DOMAIN_MAP: DomainMap = {  # one entry per domain; read by perform_query in bcorag.py as self._domain_map
+    "usability": {
+        "retrieval_prompt": USABILITY_DOMAIN_RETRIEVAL,  # formatted into RETRIEVAL_PROMPT; passed as the QueryBundle custom embedding string
+        "llm_prompt": USABILITY_DOMAIN_LLM,  # formatted into LLM_PROMPT; passed as the QueryBundle query_str
+        "top_level": False,  # when True, perform_query appends SUPPLEMENT_PROMPT to the LLM prompt
+        "user_prompt": "[u]sability",  # NOTE(review): presumably the label shown to the user when picking a domain — confirm
+        "code": "u",  # NOTE(review): presumably the short selection code matching the bracketed letters — confirm
+        "dependencies": [],  # domains whose content, when not None, is appended to this domain's LLM prompt
+    },
+    "io": {
+        "retrieval_prompt": IO_DOMAIN_RETRIEVAL,
+        "llm_prompt": IO_DOMAIN_LLM,
+        "top_level": True,
+        "user_prompt": "[i]o",
+        "code": "i",
+        "dependencies": [],
+    },
+    "description": {
+        "retrieval_prompt": DESCRIPTION_DOMAIN_RETRIEVAL,
+        "llm_prompt": DESCRIPTION_DOMAIN_LLM,
+        "top_level": True,
+        "user_prompt": "[d]escription",
+        "code": "d",
+        "dependencies": [],
+    },
+    "execution": {
+        "retrieval_prompt": EXECUTION_DOMAIN_RETRIEVAL,
+        "llm_prompt": EXECUTION_DOMAIN_LLM,
+        "top_level": True,
+        "user_prompt": "[e]xecution",
+        "code": "e",
+        "dependencies": [],
+    },
+    "parametric": {
+        "retrieval_prompt": PARAMETRIC_DOMAIN_RETRIEVAL,
+        "llm_prompt": PARAMETRIC_DOMAIN_LLM,
+        "top_level": False,
+        "user_prompt": "[p]arametric",
+        "code": "p",
+        "dependencies": ["description"],  # the description domain content is appended to the LLM prompt when it is not None
+    },
+    "error": {
+        "retrieval_prompt": ERROR_DOMAIN_RETRIEVAL,
+        "llm_prompt": ERROR_DOMAIN_LLM,
+        "top_level": False,
+        "user_prompt": "[err]or",
+        "code": "err",
+        "dependencies": [],
+    },
+}