OpenSourceEconomics · MaxBlesch · Feb 9, 2026 · Feb 9, 2026 · Feb 9, 2026
diff --git a/soepy/simulate/initial_states.py b/soepy/simulate/initial_states.py
@@ -1,123 +1,58 @@
 import numpy as np
 import pandas as pd
 
-from soepy.shared.experience_stock import exp_years_to_stock
-from soepy.shared.experience_stock import get_pt_increment
-from soepy.shared.experience_stock import stock_to_exp_years
-from soepy.shared.numerical_integration import draw_zero_one_distributed_shocks
-
-
-def prepare_simulation_data(
-    model_params,
-    model_spec,
-    prob_educ_level,
-    prob_child_age,
-    prob_partner_present,
-    prob_exp_pt,
-    prob_exp_ft,
-    biased_exp,
-):
-    """Draw initial conditions and precompute grid objects for simulation."""
-
-    initial_educ_level = np.random.choice(
-        model_spec.num_educ_levels, model_spec.num_agents_sim, p=prob_educ_level
-    )
-
-    initial_period = np.asarray(model_spec.educ_years)[initial_educ_level]
-
-    initial_child_age = np.full(model_spec.num_agents_sim, np.nan)
-    initial_partner = np.full(model_spec.num_agents_sim, np.nan)
-    initial_exp_pt = np.full(model_spec.num_agents_sim, np.nan)
-    initial_exp_ft = np.full(model_spec.num_agents_sim, np.nan)
-
-    for educ_level in range(model_spec.num_educ_levels):
-        mask = initial_educ_level == educ_level
-
-        initial_child_age[mask] = np.random.choice(
-            list(range(-1, model_spec.child_age_init_max + 1)),
-            mask.sum(),
-            p=prob_child_age[educ_level],
-        )
-
-        initial_partner[mask] = np.random.binomial(
-            size=mask.sum(),
-            n=1,
-            p=prob_partner_present[educ_level],
-        )
-        initial_exp_pt[mask] = np.random.choice(
-            list(range(0, model_spec.init_exp_max + 1)),
-            mask.sum(),
-            p=prob_exp_pt[educ_level],
-        )
-
-        initial_exp_ft[mask] = np.random.choice(
-            list(range(0, model_spec.init_exp_max + 1)),
-            mask.sum(),
-            p=prob_exp_ft[educ_level],
-        )
-
-    # Combine stocks
-    pt_increment = get_pt_increment(
-        model_params=model_params,
-        educ_level=initial_educ_level,
-        child_age=initial_child_age,
-        biased_exp=biased_exp,
-    )
-
-    total_years = initial_exp_pt * pt_increment + initial_exp_ft
-
-    initial_exp_stock = exp_years_to_stock(
-        exp_years=total_years,
-        period=0,
-        init_exp_max=model_spec.init_exp_max,
-        pt_increment=pt_increment,
-    )
-
-    lagged_choice = lagged_choice_initial(initial_exp_years=total_years)
-
-    unobserved_type = np.random.choice(
-        np.arange(model_spec.num_types),
-        model_spec.num_agents_sim,
-        p=model_params.type_shares,
-    )
-
-    draws_sim = draw_zero_one_distributed_shocks(
-        model_spec.seed_sim, model_spec.num_periods, model_spec.num_agents_sim
-    )
-    draws_sim = draws_sim * float(model_params.shock_sd)
-
-    # prob_exp_pt/prob_exp_ft are passed in from simulate.py and originate from the
-    # same legacy share files as prob_exp_years.
-    initial_states = pd.DataFrame(
-        {
-            "Identifier": np.arange(model_spec.num_agents_sim, dtype=int),
-            "Period": initial_period.astype(int),
-            "Education_Level": initial_educ_level.astype(int),
-            "Lagged_Choice": lagged_choice.astype(int),
-            "Experience_Part_Time": initial_exp_pt.astype(int),
-            "Experience_Full_Time": initial_exp_ft.astype(int),
-            "Experience_Stock": initial_exp_stock.astype(float),
-            "Type": unobserved_type.astype(int),
-            "Age_Youngest_Child": initial_child_age.astype(int),
-            "Partner_Indicator": initial_partner.astype(int),
-        }
-    )
-
-    return initial_states, draws_sim
-
-
-def lagged_choice_initial(initial_exp_years):
-    """Determine initial lagged choice from total experience.
-
-    Rule (per project decision):
-    - lagged_choice = 2 (full-time) if initial experience years > 1
-    - otherwise lagged_choice = 0
-    """
-
-    lagged_choice = np.zeros_like(initial_exp_years, dtype=int)
-    lagged_choice[initial_exp_years > 1] = 2
-    # Check if initial years is a float if so, assign 1 part-time
-    int_exp = initial_exp_years.astype(int)
-    is_float = np.abs(initial_exp_years - int_exp) > 1e-8
-    lagged_choice[is_float] = 1
-    return lagged_choice
+from soepy.shared.constants_and_indices import NUM_CHOICES
+
+
+def validate_initial_states(initial_states, model_spec):
+    required_columns = [
+        "Identifier",
+        "Period",
+        "Education_Level",
+        "Lagged_Choice",
+        "Experience_Part_Time",
+        "Experience_Full_Time",
+        "Type",
+        "Age_Youngest_Child",
+        "Partner_Indicator",
+    ]
+    missing = [col for col in required_columns if col not in initial_states.columns]
+    if missing:
+        raise ValueError(f"Initial states missing columns: {missing}")
+
+    if initial_states[required_columns].isna().any().any():
+        raise ValueError("Initial states contain missing values.")
+
+    identifiers = initial_states["Identifier"].to_numpy()
+    unique_ids = np.unique(identifiers)
+    if len(unique_ids) != len(identifiers):
+        raise ValueError("Initial state identifiers must be unique.")
+
+    if not np.issubdtype(unique_ids.dtype, np.integer):
+        raise ValueError("Initial state identifiers must be integers.")
+
+    expected_ids = np.arange(len(unique_ids), dtype=unique_ids.dtype)
+    if not np.array_equal(np.sort(unique_ids), expected_ids):
+        raise ValueError("Initial state identifiers must be consecutive from 0.")
+
+    # if (initial_states["Period"] < 0).any() or (
+    #     initial_states["Period"] >= model_spec.num_periods
+    # ).any():
+    #     raise ValueError("Initial state periods out of bounds.")
+
+    if (initial_states["Education_Level"] < 0).any() or (
+        initial_states["Education_Level"] >= model_spec.num_educ_levels
+    ).any():
+        raise ValueError("Initial state education levels out of bounds.")
+
+    if (initial_states["Type"] < 0).any() or (
+        initial_states["Type"] >= model_spec.num_types
+    ).any():
+        raise ValueError("Initial state types out of bounds.")
+
+    if (initial_states["Lagged_Choice"] < 0).any() or (
+        initial_states["Lagged_Choice"] >= NUM_CHOICES
+    ).any():
+        raise ValueError("Initial state lagged choices out of bounds.")
+
+    return initial_states
diff --git a/soepy/simulate/simulate_auxiliary.py b/soepy/simulate/simulate_auxiliary.py
@@ -3,18 +3,20 @@
 
 from soepy.shared.constants_and_indices import HOURS
 from soepy.shared.constants_and_indices import NUM_CHOICES
+from soepy.shared.experience_stock import exp_years_to_stock
 from soepy.shared.experience_stock import get_pt_increment
 from soepy.shared.experience_stock import next_stock
 from soepy.shared.non_employment import calc_erziehungsgeld
 from soepy.shared.non_employment import calculate_non_employment_consumption_resources
+from soepy.shared.numerical_integration import draw_zero_one_distributed_shocks
 from soepy.shared.wages import calculate_log_wage
 from soepy.simulate.constants_sim import DATA_FORMATS_SIM
 from soepy.simulate.constants_sim import DATA_FORMATS_SPARSE
 from soepy.simulate.constants_sim import DATA_LABLES_SIM
 from soepy.simulate.constants_sim import LABELS_DATA_SPARSE
 from soepy.simulate.constants_sim import STATE_LABELS_SIM
 from soepy.simulate.income_sim import calculate_employment_consumption_resources
-from soepy.simulate.initial_states import prepare_simulation_data
+from soepy.simulate.initial_states import validate_initial_states
 
 
 def pyth_simulate(
@@ -26,33 +28,47 @@ def pyth_simulate(
     covariates,
     non_consumption_utilities,
     child_age_update_rule,
-    prob_educ_level,
-    prob_child_age,
-    prob_partner_present,
-    prob_exp_pt,
-    prob_exp_ft,
     prob_child,
     prob_partner,
+    *,
     biased_exp,
+    initial_states,
     data_sparse=False,
 ):
     """Simulate agent histories under the continuous-experience model."""
 
     np.random.seed(model_spec.seed_sim)
 
+    initial_states = validate_initial_states(initial_states, model_spec)
+    num_agents_sim = initial_states["Identifier"].nunique()
+
     emaxs = np.asarray(emaxs)
     non_consumption_utilities = np.asarray(non_consumption_utilities)
 
-    initial_states, draws_sim = prepare_simulation_data(
+    pt_increment = get_pt_increment(
         model_params=model_params,
-        model_spec=model_spec,
-        prob_educ_level=prob_educ_level,
-        prob_child_age=prob_child_age,
-        prob_partner_present=prob_partner_present,
-        prob_exp_pt=prob_exp_pt,
-        prob_exp_ft=prob_exp_ft,
-        biased_exp=biased_exp,
+        educ_level=initial_states["Education_Level"].to_numpy(),
+        child_age=initial_states["Age_Youngest_Child"].to_numpy(),
+        biased_exp=False,
+    )
+    total_years = (
+        initial_states["Experience_Part_Time"].to_numpy() * pt_increment
+        + initial_states["Experience_Full_Time"].to_numpy()
+    )
+    initial_states = initial_states.copy()
+    initial_states["Experience_Stock"] = exp_years_to_stock(
+        exp_years=total_years,
+        period=initial_states["Period"].to_numpy(),
+        init_exp_max=model_spec.init_exp_max,
+        pt_increment=pt_increment,
+    ).astype(float)
+
+    draws_sim = draw_zero_one_distributed_shocks(
+        model_spec.seed_sim,
+        model_spec.num_periods,
+        num_agents_sim,
     )
+    draws_sim = draws_sim * float(model_params.shock_sd)
 
     data_list = simulate_agents_over_periods(
         model_spec=model_spec,
@@ -161,7 +177,8 @@ def simulate_agents_over_periods(
             )
         )
 
-        wages = np.exp(log_wage_agents + draws_sim[period, : len(current_states)])
+        identifiers = current_states.iloc[:, state_col["Identifier"]].to_numpy()
+        wages = np.exp(log_wage_agents + draws_sim[period, identifiers])
         wages = wages * float(model_spec.elasticity_scale)
 
         female_income = wages[:, None] * HOURS[None, 1:]