Commit 17620ad

Merge branch 'SakanaAI:main' into main
2 parents: dccbbe3 + a369276

File tree (3 files changed, +83 -10 lines): README.md, ai_scientist/llm.py, launch_scientist.py

README.md

Lines changed: 8 additions & 2 deletions
````diff
@@ -118,14 +118,20 @@ export VERTEXAI_LOCATION="REGION" # for Aider/LiteLLM call
 export VERTEXAI_PROJECT="PROJECT_ID" # for Aider/LiteLLM call
 ```
 
-#### DeepSeek API (DeepSeek-Coder-V2)
-
+#### DeepSeek API (deepseek-chat, deepseek-reasoner)
 By default, this uses the `DEEPSEEK_API_KEY` environment variable.
 
 #### OpenRouter API (Llama3.1)
 
 By default, this uses the `OPENROUTER_API_KEY` environment variable.
 
+#### Google Gemini
+We support Google Gemini models (e.g., "gemini-1.5-flash", "gemini-1.5-pro") via the [google-generativeai](https://pypi.org/project/google-generativeai) Python library. By default, it uses the environment variable:
+
+```bash
+export GEMINI_API_KEY="YOUR GEMINI API KEY"
+```
+
 #### Semantic Scholar API (Literature Search)
 
 Our code can also optionally use a Semantic Scholar API Key (`S2_API_KEY`) for higher throughput [if you have one](https://www.semanticscholar.org/product/api), though it should work without it in principle. If you have problems with Semantic Scholar, you can skip the literature search and citation phases of paper generation.
````
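As a quick sanity check that the key is visible to the [google-generativeai](https://pypi.org/project/google-generativeai) library, a minimal standalone sketch (not part of the repository; "gemini-1.5-flash" is just one of the two example model names in the new README section):

```python
import os

import google.generativeai as genai

# Assumes GEMINI_API_KEY was exported as shown in the README snippet above.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# "gemini-1.5-flash" is one of the example model names from the README.
model = genai.GenerativeModel("gemini-1.5-flash")
response = model.generate_content("Reply with the single word: ready")
print(response.text)
```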

ai_scientist/llm.py

Lines changed: 50 additions & 8 deletions
```diff
@@ -5,19 +5,23 @@
 import anthropic
 import backoff
 import openai
+import google.generativeai as genai
+from google.generativeai.types import GenerationConfig
 
 MAX_NUM_TOKENS = 4096
 
 AVAILABLE_LLMS = [
+    # Anthropic models
     "claude-3-5-sonnet-20240620",
     "claude-3-5-sonnet-20241022",
+    # OpenAI models
     "gpt-4o-mini-2024-07-18",
     "gpt-4o-2024-05-13",
     "gpt-4o-2024-08-06",
     "o1-preview-2024-09-12",
     "o1-mini-2024-09-12",
     "o1-2024-12-17",
-    "deepseek-coder-v2-0724",
+    # OpenRouter models
     "llama3.1-405b",
     # Anthropic Claude models via Amazon Bedrock
     "bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
@@ -31,6 +35,13 @@
     "vertex_ai/claude-3-5-sonnet-v2@20241022",
     "vertex_ai/claude-3-sonnet@20240229",
     "vertex_ai/claude-3-haiku@20240307",
+    # DeepSeek models
+    "deepseek-chat",
+    "deepseek-coder",
+    "deepseek-reasoner",
+    # Google Gemini models
+    "gemini-1.5-flash",
+    "gemini-1.5-pro",
 ]
 
 
```
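The two hunks above register the new model names. `AVAILABLE_LLMS` is the registry the rest of the code keys off, so a caller can reject unsupported names before any client is built; a hypothetical guard (a sketch, not code from the repository):

```python
from ai_scientist.llm import AVAILABLE_LLMS

def validate_model(name: str) -> str:
    """Fail fast with a readable message instead of failing deeper inside the pipeline."""
    if name not in AVAILABLE_LLMS:
        raise ValueError(f"Unknown model '{name}'. Supported: {', '.join(AVAILABLE_LLMS)}")
    return name

validate_model("deepseek-reasoner")  # added by this commit
validate_model("gemini-1.5-pro")     # added by this commit
```

The remaining hunks of `ai_scientist/llm.py` continue below.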

```diff
@@ -104,7 +115,6 @@ def get_batch_responses_from_llm(
         new_msg_history.append(hist)
 
     if print_debug:
-        # Just print the first one.
         print()
         print("*" * 20 + " LLM START " + "*" * 20)
         for j, msg in enumerate(new_msg_history[0]):
@@ -191,15 +201,14 @@ def get_response_from_llm(
             temperature=1,
             max_completion_tokens=MAX_NUM_TOKENS,
             n=1,
-            #stop=None,
             seed=0,
         )
         content = response.choices[0].message.content
         new_msg_history = new_msg_history + [{"role": "assistant", "content": content}]
-    elif model == "deepseek-coder-v2-0724":
+    elif model in ["meta-llama/llama-3.1-405b-instruct", "llama-3-1-405b-instruct"]:
         new_msg_history = msg_history + [{"role": "user", "content": msg}]
         response = client.chat.completions.create(
-            model="deepseek-coder",
+            model="meta-llama/llama-3.1-405b-instruct",
             messages=[
                 {"role": "system", "content": system_message},
                 *new_msg_history,
@@ -211,10 +220,10 @@ def get_response_from_llm(
         )
         content = response.choices[0].message.content
         new_msg_history = new_msg_history + [{"role": "assistant", "content": content}]
-    elif model in ["meta-llama/llama-3.1-405b-instruct", "llama-3-1-405b-instruct"]:
+    elif model in ["deepseek-chat", "deepseek-coder"]:
         new_msg_history = msg_history + [{"role": "user", "content": msg}]
         response = client.chat.completions.create(
-            model="meta-llama/llama-3.1-405b-instruct",
+            model=model,
             messages=[
                 {"role": "system", "content": system_message},
                 *new_msg_history,
@@ -226,6 +235,34 @@ def get_response_from_llm(
         )
         content = response.choices[0].message.content
         new_msg_history = new_msg_history + [{"role": "assistant", "content": content}]
+    elif model in ["deepseek-reasoner"]:
+        new_msg_history = msg_history + [{"role": "user", "content": msg}]
+        response = client.chat.completions.create(
+            model=model,
+            messages=[
+                {"role": "system", "content": system_message},
+                *new_msg_history,
+            ],
+            n=1,
+            stop=None,
+        )
+        content = response.choices[0].message.content
+        new_msg_history = new_msg_history + [{"role": "assistant", "content": content}]
+    elif "gemini" in model:
+        new_msg_history = msg_history + [{"role": "user", "content": msg}]
+        gemini_contents = [{"role": "system", "parts": system_message}]
+        for m in new_msg_history:
+            gemini_contents.append({"role": m["role"], "parts": m["content"]})
+        response = client.generate_content(
+            contents=gemini_contents,
+            generation_config=GenerationConfig(
+                temperature=temperature,
+                max_output_tokens=MAX_NUM_TOKENS,
+                candidate_count=1,
+            ),
+        )
+        content = response.text
+        new_msg_history = new_msg_history + [{"role": "assistant", "content": content}]
     else:
         raise ValueError(f"Model {model} not supported.")
 
@@ -287,7 +324,7 @@ def create_client(model):
     elif model in ["o1-preview-2024-09-12", "o1-mini-2024-09-12"]:
         print(f"Using OpenAI API with model {model}.")
         return openai.OpenAI(), model
-    elif model == "deepseek-coder-v2-0724":
+    elif model in ["deepseek-chat", "deepseek-reasoner"]:
         print(f"Using OpenAI API with {model}.")
         return openai.OpenAI(
             api_key=os.environ["DEEPSEEK_API_KEY"],
@@ -299,5 +336,10 @@ def create_client(model):
             api_key=os.environ["OPENROUTER_API_KEY"],
             base_url="https://openrouter.ai/api/v1"
         ), "meta-llama/llama-3.1-405b-instruct"
+    elif "gemini" in model:
+        print(f"Using Google Generative AI with model {model}.")
+        genai.configure(api_key=os.environ["GEMINI_API_KEY"])
+        client = genai.GenerativeModel(model)
+        return client, model
     else:
         raise ValueError(f"Model {model} not supported.")
```
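For reference, a minimal sketch of how the two new `create_client` branches are reached (a sketch, not code from the commit; it assumes `DEEPSEEK_API_KEY` and `GEMINI_API_KEY` are set, and the model names are the ones added to `AVAILABLE_LLMS` above):

```python
from ai_scientist.llm import create_client

# DeepSeek: an OpenAI-compatible client configured with DEEPSEEK_API_KEY.
ds_client, ds_model = create_client("deepseek-chat")

# Gemini: a google.generativeai.GenerativeModel configured with GEMINI_API_KEY.
gm_client, gm_model = create_client("gemini-1.5-flash")

print(type(ds_client).__name__, ds_model)  # expected: OpenAI deepseek-chat
print(type(gm_client).__name__, gm_model)  # expected: GenerativeModel gemini-1.5-flash
```

Either client can then be passed to `get_response_from_llm` together with the matching model name, which routes to the new `elif` branches shown above.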

launch_scientist.py

Lines changed: 25 additions & 0 deletions
```diff
@@ -98,6 +98,27 @@ def get_available_gpus(gpu_ids=None):
     return list(range(torch.cuda.device_count()))
 
 
+def check_latex_dependencies():
+    """
+    Check if required LaTeX dependencies are installed on the system.
+    Returns True if all dependencies are found, False otherwise.
+    """
+    import shutil
+    import sys
+
+    required_dependencies = ['pdflatex', 'chktex']
+    missing_deps = []
+
+    for dep in required_dependencies:
+        if shutil.which(dep) is None:
+            missing_deps.append(dep)
+
+    if missing_deps:
+        print("Error: Required LaTeX dependencies not found:", file=sys.stderr)
+        return False
+
+    return True
+
 def worker(
     queue,
     base_dir,
@@ -304,6 +325,10 @@ def do_idea(
 
     print(f"Using GPUs: {available_gpus}")
 
+    # Check LaTeX dependencies before proceeding
+    if args.writeup == "latex" and not check_latex_dependencies():
+        sys.exit(1)
+
     # Create client
     client, client_model = create_client(args.model)
 
```
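The new check prints a generic error message; to see which specific tools are missing on a given machine, the same standard-library call can be run standalone (a sketch, not part of the commit):

```python
import shutil

# Mirrors check_latex_dependencies(): both tools must be on PATH when args.writeup == "latex".
for tool in ("pdflatex", "chktex"):
    path = shutil.which(tool)
    print(f"{tool}: {path or 'NOT FOUND'}")
```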
