ft-analyzer: update flow file preparation to start earlier

pascal260303 · pascal260303 · commit d36986040e47 · 2025-10-12T16:01:32.000+02:00
diff --git a/tools/ft-analyzer/ftanalyzer/models/statistical_model.py b/tools/ft-analyzer/ftanalyzer/models/statistical_model.py
@@ -172,7 +172,7 @@ def __init__(
         if isinstance(reference, str):
             self._ref_path = reference
         else:
-            self._ref_path = self._ref_path = tempfile.NamedTemporaryFile(
+            self._ref_path = tempfile.NamedTemporaryFile(
                 delete=False, prefix="tmp_ref", suffix=".csv"
             ).name
             reference.to_csv(
@@ -186,11 +186,7 @@ def __init__(
         self._ref_ip_addresses_converted = isinstance(reference, pd.DataFrame)
         self._stat_counter = use_statistical_counter
         self._inactive_timeout = inactive_timeout
-
-        try:
-            self._flows_path = self._init_flows(flows)
-        except Exception as err:
-            raise SMException("Unable to read file with flows.") from err
+        self._flows_path = flows
 
         if merge:
             self._merge_flows(biflows_ts_correction)
@@ -222,7 +218,8 @@ def __init__(
             self._future_ref = None
             self._future_sim = None
 
-    def _init_flows(self, path: os.PathLike):
+    @staticmethod
+    def prepare_flows_file(path: os.PathLike, generator_stats: GeneratorStats):
         """initial read of flows.csv in chunks
         replaces faulty values and filters out some flows
 
@@ -238,47 +235,50 @@ def _init_flows(self, path: os.PathLike):
         first_write = True
         logging.getLogger().debug("reading file with flows=%s", path)
         # ports could be empty in flows with protocol like ICMP
-        for chunk in pd.read_csv(
-            path, dtype=self.CSV_COLUMN_TYPES_NULLABLE, chunksize=10_000
-        ):
-            chunk = chunk.fillna(
-                {
-                    "START_TIME": 0,
-                    "END_TIME": 0,
-                    "PROTOCOL": 0,
-                    "SRC_IP": "",
-                    "DST_IP": "",
-                    "SRC_PORT": 0,
-                    "DST_PORT": 0,
-                    "PACKETS": 0,
-                    "BYTES": 0,
-                    "EXPORT_TIME": 0,
-                    "SEQ_NUMBER": 0,
-                    "MSG_LENGTH": 0,
-                }
-            ).astype(self.CSV_COLUMN_TYPES)
-
-            self._zero_icmp_ports(chunk)
-
-            if self._generator_stats.start_time > 0:
-                # filter out flows that start before the start time with 500 ms tolerance
-                chunk = chunk[
-                    chunk["START_TIME"] >= self._generator_stats.start_time - 500
-                ]
-
-            # if stats.end_time > 0:
-            #    # filter out flows that start before the end time
-            #    chunk = chunk[chunk["START_TIME"] <= stats.end_time]
-
-            self._filter_multicast(chunk)
-
-            chunk.to_csv(
-                out_file,
-                index=False,
-                mode="w" if first_write else "a",
-                header=first_write,
-            )
-            first_write = False
+        # open output file once to avoid repeated open/close syscalls
+        with open(out_file, "w", newline="", encoding="ascii") as csvf:
+            for chunk in pd.read_csv(
+                path, dtype=StatisticalModel.CSV_COLUMN_TYPES_NULLABLE, chunksize=10_000
+            ):
+                # fill missing values in-place to avoid extra copy
+                chunk.fillna(
+                    {
+                        "START_TIME": 0,
+                        "END_TIME": 0,
+                        "PROTOCOL": 0,
+                        "SRC_IP": "",
+                        "DST_IP": "",
+                        "SRC_PORT": 0,
+                        "DST_PORT": 0,
+                        "PACKETS": 0,
+                        "BYTES": 0,
+                        "EXPORT_TIME": 0,
+                        "SEQ_NUMBER": 0,
+                        "MSG_LENGTH": 0,
+                    },
+                    inplace=True,
+                )
+
+                chunk = chunk.astype(StatisticalModel.CSV_COLUMN_TYPES)
+
+                # zero ICMP ports (vectorized)
+                StatisticalModel._zero_icmp_ports(chunk)
+
+                # build a single combined mask to apply all filters in one go
+                mask = np.ones(len(chunk), dtype=bool)
+                if generator_stats.start_time > 0:
+                    mask &= chunk["START_TIME"] >= generator_stats.start_time - 500
+
+                # multicast filters: ipv4 and ipv6
+                # DST_IP might be empty string for some rows (we set that above), so startswith is safe
+                mask &= chunk["DST_IP"] != "255.255.255.255"
+                mask &= ~chunk["DST_IP"].str.startswith("ff02:")
+
+                filtered = chunk.loc[mask]
+
+                # write filtered chunk to CSV using the open file handle
+                filtered.to_csv(csvf, index=False, header=first_write)
+                first_write = False
 
         os.remove(path)
         return out_file
@@ -323,7 +323,8 @@ def _load_ref_df(self):
             self._ref_path, engine="pyarrow", dtype=self.CSV_COLUMN_TYPES
         )
 
-    def _zero_icmp_ports(self, df: pd.DataFrame):
+    @staticmethod
+    def _zero_icmp_ports(df: pd.DataFrame):
         icmp_protocols = [1, 58]  # ICMP and ICMPv6
         icmp_mask = df["PROTOCOL"].isin(icmp_protocols)
         df.loc[icmp_mask, ["SRC_PORT", "DST_PORT"]] = 0
diff --git a/tools/ft-orchestration/tests/simulation/test_simulation_general.py b/tools/ft-orchestration/tests/simulation/test_simulation_general.py
@@ -374,6 +374,7 @@ def finalizer_download_logs():
             probe_instance.host_statistics.get_csv(tmp_dir)
 
         flows_file_future.result()
+        flows_file = StatisticalModel.prepare_flows_file(flows_file, stats)
         replicated_ref = replicated_ref_future.result()
 
     stats_report, precise_report = validate(
diff --git a/tools/ft-orchestration/tests/simulation/test_simulation_overload.py b/tools/ft-orchestration/tests/simulation/test_simulation_overload.py
@@ -322,6 +322,7 @@ def finalizer_download_logs():
             probe_instance.host_statistics.get_csv(tmp_dir)
 
         flows_file_future.result()
+        flows_file = StatisticalModel.prepare_flows_file(flows_file, stats)
         replicated_ref = replicated_ref_future.result()
 
     model = StatisticalModel(
diff --git a/tools/ft-orchestration/tests/simulation/test_simulation_threshold.py b/tools/ft-orchestration/tests/simulation/test_simulation_threshold.py
@@ -227,7 +227,9 @@ def finalizer_download_logs():
     request.addfinalizer(cleanup)
     request.addfinalizer(finalizer_download_logs)
 
-    def run_single_test(loops: int, speed: MbpsSpeed) -> tuple[bool, StatisticalReport]:
+    def run_single_test(
+        loops: int, speed: MbpsSpeed, flows_file: os.PathLike
+    ) -> tuple[bool, StatisticalReport]:
         logging.getLogger().info(
             "running test with speed: %s Mbps (loops: %s)", speed.speed, loops
         )
@@ -272,6 +274,7 @@ def run_single_test(loops: int, speed: MbpsSpeed) -> tuple[bool, StatisticalRepo
                 probe_instance.host_statistics.get_csv(tmp_dir)
 
             flows_file_future.result()
+            flows_file = StatisticalModel.prepare_flows_file(flows_file, stats)
             replicated_ref = replicated_ref_future.result()
 
         flow_replicator = None
@@ -333,6 +336,7 @@ def run_single_test(loops: int, speed: MbpsSpeed) -> tuple[bool, StatisticalRepo
             result, report = run_single_test(
                 max(1, int(math.ceil(speed_current / scenario.default.mbps))),
                 MbpsSpeed(speed_current),
+                flows_file,
             )
             report.print_results()
         except Exception as e: