
Commit e7cf8b7

This might be the ticket?
1 parent 3a939bf commit e7cf8b7

File tree

5 files changed: +119 −65 lines changed

build/docker-compose.yaml

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ services:
       - LOG_LEVEL=DEBUG # DEBUG INFO WARNING ERROR CRITICAL # Default is INFO
       # - LOG_RECENT_COMMITS=5
       - MAX_CONCURRENT_CONVERSIONS_GLOBAL=20
-      - MAX_CONCURRENT_CONVERSIONS_PER_SERVER=10
+      - MAX_CONCURRENT_CONVERSIONS_PER_SERVER=3
       # - MAX_CYCLES=5
       - REPO_CONVERTER_INTERVAL_SECONDS=30 # Default is 3600 seconds (1 hour)
       # image: ghcr.io/sourcegraph/repo-converter:HEAD
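Editor's note: this drops the per-server conversion cap from 10 to 3 while leaving the global cap at 20. A hedged sketch of how these two variables plausibly reach the concurrency manager through the container environment (the repo's exact config plumbing may differ):

```python
import os

# Fallback defaults here are illustrative, not the repo's documented defaults
global_limit = int(os.environ.get("MAX_CONCURRENT_CONVERSIONS_GLOBAL", "10"))
per_server_limit = int(os.environ.get("MAX_CONCURRENT_CONVERSIONS_PER_SERVER", "10"))

print(global_limit, per_server_limit)  # 20 3 with this commit's compose values
```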

dev/TODO.md

Lines changed: 2 additions & 2 deletions
@@ -58,8 +58,8 @@
 ### Stability

 - `git svn fetch`
-    - Updated batch end rev is not getting written to .git/config, file perms issue?
-    - Updated batch end rev is not getting read from .git/config, file perms issue?
+    - Commits not getting committed to the local repo
+    - Not entirely sure if svn is blocking me

 - `svn log` commands
     - Longest commands, which seem to be timing out and causing issues

src/source_repo/svn.py

Lines changed: 89 additions & 53 deletions
@@ -34,8 +34,6 @@ def convert(ctx: Context) -> None:
     Entrypoint / main logic / orchestration function
     """

-    job_start_time = int(time.time())
-
     # Extract repo conversion job config values from the repos list in ctx,
     # and set default values for required but undefined configs
     _extract_repo_config_and_set_default_values(ctx)
@@ -75,14 +73,14 @@ def convert(ctx: Context) -> None:
     if _check_if_repo_already_up_to_date(ctx):

         # If the repo already exists, and is already up to date, then exit early
-
         ### EXTERNAL COMMAND: svn log ###
-        _log_recent_commits(ctx, commands)
-        _cleanup(ctx)
+        _cleanup(ctx, commands)
+        log(ctx, "Ending svn repo conversion job", "info")
         return

     ### EXTERNAL COMMAND: svn log ###
     # This is the big one, to count all revs remaining
+    # TODO: Separate the svn log range from calculating batch revisions
     _log_number_of_revs_out_of_date(ctx, commands)

     # Calculate revision range for this fetch
@@ -95,24 +93,14 @@ def convert(ctx: Context) -> None:
     git_svn_fetch_result = _git_svn_fetch(ctx, commands)

     ## Gather information needed to decide if the fetch was successful or failed
-
-    # Check if the repo is valid after the fetch
-    if _check_if_repo_exists_locally(ctx, "end"):
-        pass
-
     # Cleanup before exit
-    _cleanup(ctx)
-
-    # Get dir size of converted git repo
-    _get_local_git_repo_stats(ctx, "end")
-
-    ctx.job["result"]["run_time_seconds"] = int(time.time() - job_start_time)
+    _cleanup(ctx, commands)

     ## Decide if the fetch was successful or failed
     ## Also update batch end rev in git repo config file
     _verify_git_svn_fetch_success(ctx, git_svn_fetch_result)

-    log(ctx, "SVN repo conversion job complete", "info")
+    log(ctx, "Ending svn repo conversion job", "info")


 def _extract_repo_config_and_set_default_values(ctx: Context) -> None:
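Editor's note: taken together, the hunks above reshape convert() so that the early-exit path and the normal path both funnel through the same _cleanup(ctx, commands) call, and per-job timing moves out of convert() entirely (see the concurrency_manager.py changes below). A runnable toy sketch of the resulting control flow, with stubbed helpers standing in for the real ones:

```python
# Toy sketch only: helper bodies are stubs, not the repo's real implementations
def _check_if_repo_already_up_to_date(ctx): return ctx["up_to_date"]
def _git_svn_fetch(ctx): return {"return_code": 0}
def _cleanup(ctx): ctx["cleaned"] = True  # real version: repo stats, recent commits, git gc
def _verify_git_svn_fetch_success(ctx, result): ctx["verified"] = (result["return_code"] == 0)
def log(ctx, msg, level): print(f"{level}: {msg}")

def convert(ctx):
    if _check_if_repo_already_up_to_date(ctx):
        _cleanup(ctx)                      # same cleanup on the early-exit path
        log(ctx, "Ending svn repo conversion job", "info")
        return
    result = _git_svn_fetch(ctx)
    _cleanup(ctx)                          # and on the normal path
    _verify_git_svn_fetch_success(ctx, result)
    log(ctx, "Ending svn repo conversion job", "info")

convert({"up_to_date": False})  # prints: info: Ending svn repo conversion job
```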
@@ -675,14 +663,19 @@ def _log_recent_commits(ctx: Context, commands: dict) -> None:
     ctx.job.pop("svn_log_output")


-def _cleanup(ctx: Context) -> None:
+def _cleanup(ctx: Context, commands: dict) -> None:
     """
     Groups up any other functions needed to clean up before exit
     """

+    # Get dir size of converted git repo
+    _get_local_git_repo_stats(ctx, "end")
+
+    _log_recent_commits(ctx, commands)
+
     # Run git garbage collection and cleanup branches, even if repo is already up to date
-    git.garbage_collection(ctx)
     git.cleanup_branches_and_tags(ctx)
+    git.garbage_collection(ctx)


 def _log_number_of_revs_out_of_date(ctx: Context, commands: dict) -> None:
@@ -713,7 +706,7 @@ def _log_number_of_revs_out_of_date(ctx: Context, commands: dict) -> None:
     log(ctx, "Logging remaining_revs; note: this is an expensive operation", "info")


-def _calculate_batch_revisions(ctx: Context, commands: dict) -> dict:
+def _calculate_batch_revisions(ctx: Context, commands: dict) -> bool:
     """
     Run the svn log command to calculate batch start and end revisions for fetching
     """
@@ -730,53 +723,79 @@ def _calculate_batch_revisions(ctx: Context, commands: dict) -> bool:
     # Pick a revision number to start with; may or may not be a real rev number
     this_batch_start_rev = int(previous_batch_end_rev + 1)

-
     # Run the svn log command to get real revision numbers for this batch
-    cmd_svn_log_get_batch_revs = cmd_svn_log + ["--limit", str(fetch_batch_size), "--revision", f"{this_batch_start_rev}:HEAD"]
-    cmd_svn_log_get_batch_revs_result = cmd.run_subprocess(ctx, cmd_svn_log_get_batch_revs, password, name="cmd_svn_log_get_batch_revs")
-    cmd_svn_log_get_batch_revs_output_list = list(cmd_svn_log_get_batch_revs_result.get("output",""))
-    cmd_svn_log_get_batch_revs_output_string = " ".join(cmd_svn_log_get_batch_revs_output_list)
-
-    if cmd_svn_log_get_batch_revs_result["return_code"] == 0 and \
-        len(cmd_svn_log_get_batch_revs_output_list) > 0 and \
-        "revision" in cmd_svn_log_get_batch_revs_output_string:
+    cmd_svn_log_get_batch_revs = cmd_svn_log + ["--limit", str(fetch_batch_size), "--revision", f"{this_batch_start_rev}:HEAD"]
+    process_result = cmd.run_subprocess(ctx, cmd_svn_log_get_batch_revs, password, name="cmd_svn_log_get_batch_revs")
+    log_details = {"process": process_result}
+    output_list = list(process_result.get("output",""))
+    output_string = " ".join(output_list)
+    len_output_list = len(output_list)
+    # Start off as a set type for built-in deduplication
+    list_of_revs_this_batch = set()
+
+    if process_result["return_code"] == 0 and \
+        len_output_list > 0 and \
+        "revision" in output_string:
+
+        ## Extract the specific revisions from the svn log output
+        # "output": [
+        #     "<?xml version=\"1.0\" encoding=\"UTF-8\"?>",
+        #     "<log>",
+        #     "<logentry",
+        #     "   revision=\"1636921\">",
+        #     "</logentry>",
+        #     "<logentry",
+        #     "   revision=\"1636922\">",
+        #     "</logentry>",
+        #     "</log>"
+        # ],
+
+        for line in output_list:
+            if "revision" in line:
+                list_of_revs_this_batch.add(int(line.split("revision=\"")[1].split("\"")[0]))
+
+        # Then convert to a list for sorting
+        list_of_revs_this_batch = sorted(list_of_revs_this_batch)

         # Update this batch's starting rev to the first real rev number after the previous end rev
-        this_batch_start_rev = int(" ".join(cmd_svn_log_get_batch_revs_output_list).split("revision=\"")[1].split("\"")[0])
+        this_batch_start_rev = min(list_of_revs_this_batch)
         ctx.job["stats"]["local"]["this_batch_start_rev"] = this_batch_start_rev

-        # Reverse the output so we can get the last revision number
-        cmd_svn_log_get_batch_revs_output_list.reverse()
-        this_batch_end_rev = int(" ".join(cmd_svn_log_get_batch_revs_output_list).split("revision=\"")[1].split("\"")[0])
+        # Get the last revision number
+        this_batch_end_rev = max(list_of_revs_this_batch)
         ctx.job["stats"]["local"]["this_batch_end_rev"] = this_batch_end_rev

     else:
         log_failure_message = "Failed to get batch revs from svn log"


-    ## Check if the output isn't as long as we were expecting
-    # Expected output number of lines for
-    # svn log --xml --with-no-revprops --non-interactive --limit 10 --revision 1:HEAD
-    # is 3 lines per revision
-    # and 3 lines for xml format start / end
-    expected_output_list_len = (fetch_batch_size * 3) + 3
-
-    if len(cmd_svn_log_get_batch_revs_output_list) < expected_output_list_len:
-        log_failure_message = "svn log returned fewer lines than expected"
-
     ## Count how many revs are in the svn log output
-    revs_in_svn_log_output = cmd_svn_log_get_batch_revs_output_string.count("revision=")
+    len_list_of_revs_this_batch = len(list_of_revs_this_batch)
     # Grab the min, in case we are close to the current rev,
     # and there are fewer revs remaining than our current batch size
-    fetching_batch_count = min(revs_in_svn_log_output, fetch_batch_size)
+    fetching_batch_count = min(len_list_of_revs_this_batch, fetch_batch_size)
     # Store it in the job stats dict
-    ctx.job["stats"]["local"]["fetching_batch_count"] = fetching_batch_count
+    ctx.job["stats"]["local"]["fetching_batch_count"] = fetching_batch_count
+    ctx.job["stats"]["local"]["list_of_revs_this_batch"] = list_of_revs_this_batch
+
+
+    # ## Check if the output isn't as long as we were expecting
+    # This isn't a valid check, as
+    # some repos are smaller than our batch size,
+    # and once the repo conversion catches up to the latest rev,
+    # there will be fewer commits to convert each run
+    # # Expected output number of lines for
+    # # svn log --xml --with-no-revprops --non-interactive --limit 10 --revision 1:HEAD
+    # # is 3 lines per revision
+    # # and 3 lines for xml format start / end
+    # expected_output_list_len = (fetch_batch_size * 3) + 3
+    # if len_output_list < expected_output_list_len:
+    #     log(ctx, f"svn log returned fewer lines: {len_output_list} than expected: {expected_output_list_len}", "warning", log_details)


     if log_failure_message:
         set_job_result(ctx, "skipped", log_failure_message, False)
-        log(ctx, log_failure_message, "error")
+        log(ctx, log_failure_message, "error", log_details)
         return False

     else:
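Editor's note: the core of the new parsing logic above, pulled out as a self-contained sketch. A set deduplicates revision numbers scraped from the svn log --xml lines, and min()/max() replace the old string-reversal trick for finding the batch's start and end revs. extract_batch_revs is an illustrative name, not a function in the repo:

```python
def extract_batch_revs(output_list):
    # Set for built-in deduplication, then sorted list, as in the diff above
    revs = set()
    for line in output_list:
        if "revision" in line:
            revs.add(int(line.split('revision="')[1].split('"')[0]))
    return sorted(revs)

output_list = [
    '<?xml version="1.0" encoding="UTF-8"?>',
    "<log>",
    "<logentry",
    '   revision="1636921">',
    "</logentry>",
    "<logentry",
    '   revision="1636922">',
    "</logentry>",
    "</log>",
]
revs = extract_batch_revs(output_list)
print(min(revs), max(revs))  # 1636921 1636922 -> this_batch_start_rev / this_batch_end_rev
```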
@@ -857,11 +876,18 @@ def _verify_git_svn_fetch_success(ctx: Context, git_svn_fetch_result: dict) -> N
     ## Gather needed inputs
     action = "git svn fetch"
     ctx.job["result"]["failures"] = []
+    git_svn_fetch_output_for_errors = list(git_svn_fetch_result.get("output",""))
     git_svn_fetch_output = list(git_svn_fetch_result.get("output",""))
     job_config = ctx.job.get("config","")
     job_stats_local = ctx.job.get("stats","").get("local","")
     structured_log_dict = {"process": git_svn_fetch_result}

+
+    # Check if the repo is valid after the fetch
+    if _check_if_repo_exists_locally(ctx, "end"):
+        pass
+
+
     ## Check for any errors in the command output
     # TODO: Test the error message processing

@@ -874,7 +900,7 @@ def _verify_git_svn_fetch_success(ctx: Context, git_svn_fetch_result: dict) -> N

     # Remove the not_error lines from the output list
     for not_error in not_errors:
-        git_svn_fetch_output = [x for x in git_svn_fetch_output if not re.search(not_error, x)]
+        git_svn_fetch_output_for_errors = [x for x in git_svn_fetch_output_for_errors if not re.search(not_error, x)]

     # Check for expected error messages
     # We should keep this list tidy, as execution time is
@@ -945,24 +971,27 @@ def _verify_git_svn_fetch_success(ctx: Context, git_svn_fetch_result: dict) -> N
     for error_category in error_message_regex_patterns_dict.keys():
         for error_message_regex_pattern in error_message_regex_patterns_dict.get(error_category):

+            regex_pattern = rf".*{error_message_regex_pattern}.*"
+            regex = re.compile(regex_pattern, flags=re.IGNORECASE)
+
             # We need the line match, but testing the match across the entire list first to reduce the exponential runtime
-            list_match = re.search(error_message_regex_pattern, " ".join(git_svn_fetch_output))
+            list_match = regex.search(" ".join(git_svn_fetch_output_for_errors))

             if list_match:
                 for match_group in list_match.groups():
-                    for line in git_svn_fetch_output:
+                    for line in git_svn_fetch_output_for_errors:

                         # Re-running the match, as list_match may match across lines,
                         # but we only want to match within each line
-                        line_match = re.search(error_message_regex_pattern, line)
+                        line_match = regex.search(line)

                         if line_match:

                             ctx.job["result"]["failures"].append(f"Error message: {error_category}: {line}")

                             # Remove the svn fetch error line from the process output list to avoid duplicate output,
                             # if one line in the error message matches multiple error_messages
-                            git_svn_fetch_output.remove(line)
+                            git_svn_fetch_output_for_errors.remove(line)


     ## Get the latest commit from the git repo's commit logs
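Editor's note: the two-pass matching above, reduced to a standalone sketch under assumed inputs. The pattern is compiled once per error message, a single search over the joined output acts as a cheap pre-filter, and only on a hit does the per-line scan (with the de-duplicating remove()) run; the sketch iterates a copy of the list so removal during iteration is safe:

```python
import re

output_for_errors = ["Checksum mismatch: abc != def", "r123 = deadbeef", "done"]
error_patterns = {"checksum": [r"checksum mismatch"]}  # illustrative category/pattern

failures = []
for category, patterns in error_patterns.items():
    for pattern in patterns:
        regex = re.compile(rf".*{pattern}.*", flags=re.IGNORECASE)

        # Cheap pre-filter across the whole joined list before the per-line scan
        if regex.search(" ".join(output_for_errors)):
            for line in list(output_for_errors):
                if regex.search(line):
                    failures.append(f"Error message: {category}: {line}")
                    output_for_errors.remove(line)  # avoid duplicate reporting

print(failures)  # ['Error message: checksum: Checksum mismatch: abc != def']
```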
@@ -989,6 +1018,13 @@ def _verify_git_svn_fetch_success(ctx: Context, git_svn_fetch_result: dict) -> N
         ctx.job["result"]["failures"].append(f"git_commits_added: {git_commits_added} != fetch_batch_size: {fetch_batch_size}")


+    ## Count how many, and which revs were checked in this fetch
+    # Verify each of them are in the git log output
+    # TODO: Implement this
+    # git_svn_fetch_output
+
+    ## Make final success / fail call
+
     if len(ctx.job["result"]["failures"]) > 0:

         reason = "output failed verification"

src/utils/concurrency_manager.py

Lines changed: 15 additions & 9 deletions
@@ -93,7 +93,7 @@ def acquire_job_slot(self, ctx: Context) -> bool:

            if active_job_repo == this_job_repo:
                set_job_result(ctx, "skipped", "Repo job already in progress", False)
-                log(ctx, f"Skipping; Repo job already in progress; repo: {active_job_repo}, timestamp: {active_job_timestamp}; trace: {active_job_trace}; running for: {int(time.time() - active_job_timestamp)} seconds", "info")
+                log(ctx, f"{this_job_repo} Skipping; Repo job already in progress; started at: {active_job_timestamp}; trace: {active_job_trace}; running for: {int(time.time() - active_job_timestamp)} seconds", "info")
                return False

        ## Add this job to the dict of waiting jobs, just in case the blocking semaphore acquire takes a while
@@ -112,17 +112,17 @@ def acquire_job_slot(self, ctx: Context) -> bool:

        # Check the semaphore value for number of remaining slots
        if server_semaphore.get_value() <= 0:
-            log(ctx, f"Hit per-server concurrency limit; MAX_CONCURRENT_CONVERSIONS_PER_SERVER={self.per_server_limit}, waiting for a server slot", "info", log_concurrency_status=True)
+            log(ctx, f"{this_job_repo} Hit per-server concurrency limit; MAX_CONCURRENT_CONVERSIONS_PER_SERVER={self.per_server_limit}, waiting for a server slot", "info", log_concurrency_status=True)

        ## Check global limit
        if self.global_semaphore.get_value() <= 0:
-            log(ctx, f"Hit global concurrency limit; MAX_CONCURRENT_CONVERSIONS_GLOBAL={self.global_limit}, waiting for a slot", "info", log_concurrency_status=True)
+            log(ctx, f"{this_job_repo} Hit global concurrency limit; MAX_CONCURRENT_CONVERSIONS_GLOBAL={self.global_limit}, waiting for a slot", "info", log_concurrency_status=True)

        ## Acquire a slot in the server-specific semaphore
        # Want to block, so that the main loop has to wait until all repos get a chance to run through before finishing
        if not server_semaphore.acquire(block=True):

-            log(ctx, "server_semaphore.acquire failed", "error", log_concurrency_status=True)
+            log(ctx, f"{this_job_repo} server_semaphore.acquire failed", "error", log_concurrency_status=True)
            return False

        ## Acquire a slot in the global semaphore
@@ -132,7 +132,7 @@ def acquire_job_slot(self, ctx: Context) -> bool:
            # Release the server semaphore since we couldn't get the global one
            server_semaphore.release()

-            log(ctx, "self.global_semaphore.acquire failed", "error", log_concurrency_status=True)
+            log(ctx, f"{this_job_repo} self.global_semaphore.acquire failed", "error", log_concurrency_status=True)
            return False

        ## Successfully acquired both semaphores
@@ -171,8 +171,10 @@ def acquire_job_slot(self, ctx: Context) -> bool:
        # Overwrite the managed list
        self.queued_jobs[server_name] = queued_jobs_list

+        ctx.job["result"]["start_timestamp"] = this_job_timestamp
+
        # Log an update
-        log(ctx, f"Acquired job slot", "debug")
+        log(ctx, f"{this_job_repo} Acquired job slot", "debug")

        return True

@@ -184,6 +186,7 @@ def _get_server_semaphore(self, ctx: Context):

        # Get job information from context
        this_job_config = ctx.job.get("config","")
+        this_job_repo = this_job_config.get("repo_key","")
        server_name = this_job_config.get("server_name","")

        # Wait for the lock to be free
@@ -199,7 +202,7 @@ def _get_server_semaphore(self, ctx: Context):
                self.per_server_semaphores[server_name] = multiprocessing.Semaphore(self.per_server_limit)

                # Can't log with log_concurrency_status=True, causes a deadlock
-                log(ctx, f"Created concurrency limit semaphore for server {server_name} with limit {self.per_server_limit}", "debug")
+                log(ctx, f"{this_job_repo} Created concurrency limit semaphore for server {server_name} with limit {self.per_server_limit}", "debug")

        # Whether the server already had a semaphore in the dict, or one was just created for it, return the semaphore object
        return self.per_server_semaphores[server_name]
@@ -355,7 +358,10 @@ def release_job_slot(self, ctx: Context) -> None:
            # Overwrite the managed list
            self.active_jobs[server_name] = server_active_jobs_list

-            log(ctx, f"Released job slot", "debug")
+            ctx.job["result"]["end_timestamp"] = int(time.time())
+            ctx.job["result"]["execution_time"] = int(ctx.job["result"]["end_timestamp"] - ctx.job["result"]["start_timestamp"])
+
+            log(ctx, f"{this_job_repo} Released job slot", "debug")

        except ValueError as e:
-            log(ctx, f"Error releasing job slot: {e}", "error")
+            log(ctx, f"{this_job_repo} Error releasing job slot: {e}", "error")

src/utils/log.py

Lines changed: 12 additions & 0 deletions
@@ -119,6 +119,18 @@ def _build_structured_payload(

    # Merge any job data from the context
    if ctx.job:
+
+        ctx_job_result = ctx.job.get("result",{})
+        start_timestamp = ctx_job_result.get("start_timestamp")
+        end_timestamp = ctx_job_result.get("end_timestamp")
+        execution_time = ctx_job_result.get("execution_time")
+
+        # If the job is still running
+        if start_timestamp and not end_timestamp and not execution_time:
+
+            # Then add a running_time_seconds
+            ctx.job["result"]["running_time_seconds"] = int(time.time() - start_timestamp)
+
        payload.update({"job": dict(ctx.job)})

    # Remove any null values
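Editor's note: a tiny sketch of the timing fields these last two files now share, using a made-up start time. acquire_job_slot stamps start_timestamp, release_job_slot stamps end_timestamp and execution_time, and while only the start exists the logger derives a live running_time_seconds:

```python
import time

job_result = {"start_timestamp": int(time.time()) - 42}  # pretend the job began 42s ago

start = job_result.get("start_timestamp")
end = job_result.get("end_timestamp")

# If the job is still running, report a live running time in every log payload
if start and not end:
    job_result["running_time_seconds"] = int(time.time() - start)

print(job_result["running_time_seconds"])  # ~42
```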
