-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_pipeline.py
More file actions
271 lines (218 loc) · 9.89 KB
/
test_pipeline.py
File metadata and controls
271 lines (218 loc) · 9.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
#!/usr/bin/env python3
"""
Comprehensive pipeline test for CSV to LaTeX converter.
This script tests the complete pipeline including:
1. CSV loading and data processing
2. Generalized configuration features
3. Row filtering and sorting
4. Value replacements
5. Pattern formatting
6. Column formatting
7. LaTeX generation with underlines
8. Backward compatibility
Usage:
python test_pipeline.py
Files used:
- test_data.csv: Sample data with year, month, cluster, generative, scaled columns
- test_config.yaml: Comprehensive configuration demonstrating all features
"""
import pandas as pd
import sys
import os
# Add src to path for imports
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
from csv2latex.config import ConfigManager
from csv2latex.utils import DataProcessor
from csv2latex.latex import LatexFormatter
def test_complete_pipeline():
    """Run the complete CSV-to-LaTeX conversion pipeline end to end.

    Exercises data/config loading, generalized row filtering and sorting,
    value replacements, pattern and column formatting, LaTeX generation,
    table styles, and backward compatibility with legacy configurations.

    Returns:
        bool: True when every check passes, False otherwise.
    """
    print("=" * 80)
    print("COMPREHENSIVE CSV TO LATEX PIPELINE TEST")
    print("=" * 80)

    # Step 1: Load data and configuration.
    # Fix: section headers used a doubled backslash ("\\n") and printed the
    # two characters "\n" verbatim instead of a blank line; use a real
    # newline escape throughout.
    print("\n1. LOADING DATA AND CONFIGURATION")
    print("-" * 40)
    try:
        df = pd.read_csv("test_data.csv")
        config = ConfigManager()
        config.load_config_file("test_config.yaml")
        print(f"✅ Data loaded: {df.shape[0]} rows, {df.shape[1]} columns")
        print(f"✅ Configuration loaded: {config.config_path}")
        print(f" Columns: {list(df.columns)}")
        print(f" Data types: {dict(df.dtypes)}")
    except Exception as e:
        # Any failure here (missing file, bad YAML, ...) aborts the test run.
        print(f"❌ Error loading data/config: {e}")
        return False

    # Step 2: Generalized row filtering — the config is expected to drop
    # clusters 4 and 5.
    print("\n2. TESTING GENERALIZED ROW FILTERING")
    print("-" * 40)
    original_shape = df.shape
    filtered_df = DataProcessor.filter_excluded_values(df, config)
    excluded_clusters = set(df['cluster'].unique()) - set(filtered_df['cluster'].unique())
    print(f"✅ Original data: {original_shape[0]} rows")
    print(f"✅ After filtering: {filtered_df.shape[0]} rows")
    print(f"✅ Excluded clusters: {sorted(excluded_clusters)}")
    if excluded_clusters != {4, 5}:
        print(f"❌ Expected to exclude clusters 4,5 but excluded {excluded_clusters}")
        return False

    # Step 3: Generalized row sorting (custom order comes from the config).
    print("\n3. TESTING GENERALIZED ROW SORTING")
    print("-" * 40)
    sorted_df = DataProcessor.sort_by_custom_order(filtered_df, config)
    print("✅ Data sorted by custom order")
    print(" First 10 rows (year, month, cluster):")
    for i, (_, row) in enumerate(sorted_df[['year', 'month', 'cluster']].head(10).iterrows()):
        print(f" {i+1:2d}. {row['year']}-{row['month']:2d}-{row['cluster']}")

    # Step 4: Value replacements for arbitrary columns.
    print("\n4. TESTING VALUE REPLACEMENTS")
    print("-" * 40)
    test_replacements = [
        ("year", 2022, "Year 1"),
        ("year", 2023, "Year 2"),
        ("month", 0, "Jan"),
        ("month", 6, "Jul"),
        ("cluster", 0, "0"),  # No replacement configured
    ]
    replacement_success = True
    for col, value, expected in test_replacements:
        result = config.get_value_replacement(col, value)
        status = "✅" if result == expected else "❌"
        print(f" {status} {col}[{value}] -> '{result}' (expected: '{expected}')")
        if result != expected:
            replacement_success = False
    if not replacement_success:
        print("❌ Some value replacements failed")
        return False

    # Step 5: Pattern formatting (symbols attached to matching values).
    print("\n5. TESTING PATTERN FORMATTING")
    print("-" * 40)
    generative_patterns = config.get_column_patterns("generative")
    scaled_patterns = config.get_column_patterns("scaled")
    print(f"✅ Generative patterns: {generative_patterns}")
    print(f"✅ Scaled patterns: {scaled_patterns}")

    # Step 6: Per-column numeric format strings.
    # Fix: mismatches were printed with "❌" but never failed the run; track
    # them in format_success and include it in the final verdict.
    print("\n6. TESTING COLUMN FORMATTING")
    print("-" * 40)
    format_tests = [
        ("year", 2022.0, "d"),
        ("month", 0.0, "d"),
        ("generative", 0.0092, ".4f"),
        ("scaled", 0.0180, ".4f"),
    ]
    format_success = True
    for col, value, expected_format in format_tests:
        actual_format = config.get_column_format(col)
        ok = actual_format == expected_format
        status = "✅" if ok else "❌"
        print(f" {status} {col} format: '{actual_format}' (expected: '{expected_format}')")
        if not ok:
            format_success = False

    # Step 7: LaTeX generation for a small sample of rows.
    print("\n7. TESTING LATEX GENERATION")
    print("-" * 40)
    latex_formatter = LatexFormatter(config)
    selected_columns = {col: config.get_pretty_column_name(col) for col in sorted_df.columns}
    latex_output = latex_formatter.generate_latex_table(
        sorted_df.head(5), selected_columns, decimal_places=4
    )
    print("✅ LaTeX table generated successfully")
    print(f" Table style: {config.table_style}")
    print(" Sample output (first data row):")
    # Fix: split on a real newline; splitting on the two-character sequence
    # "\n" never matched, so the sample data row was never displayed.
    latex_lines = latex_output.split('\n')
    for line in latex_lines:
        if '&' in line and 'textbf' not in line and 'hline' not in line and 'rule' not in line:
            print(f" {line}")
            break

    # Step 8: Both supported table styles.
    # Fix: the "✅" markers were printed unconditionally even when the check
    # was False, and a failing style check never failed the run; the markers
    # and style_success now reflect the actual booleans. The configured style
    # is restored via try/finally so an exception cannot leave it overridden.
    print("\n8. TESTING TABLE STYLES")
    print("-" * 40)
    style_success = True
    for style in ['hline', 'booktabs']:
        print(f" Testing {style} style:")
        original_style = config._config.get('table_style', 'hline')
        config._config['table_style'] = style
        try:
            style_formatter = LatexFormatter(config)
            style_output = style_formatter.generate_latex_table(
                sorted_df.head(2), selected_columns, decimal_places=4
            )
        finally:
            config._config['table_style'] = original_style
        has_hline = '\\hline' in style_output
        has_booktabs = ('\\toprule' in style_output or '\\midrule' in style_output
                        or '\\bottomrule' in style_output)
        if style == 'hline':
            print(f" {'✅' if has_hline else '❌'} Uses \\hline: {has_hline}")
            print(f" {'✅' if not has_booktabs else '❌'} No booktabs rules: {not has_booktabs}")
            if not has_hline or has_booktabs:
                style_success = False
        else:
            has_toprule = '\\toprule' in style_output
            has_midrule = '\\midrule' in style_output
            has_bottomrule = '\\bottomrule' in style_output
            print(f" {'✅' if has_toprule else '❌'} Uses \\toprule: {has_toprule}")
            print(f" {'✅' if has_midrule else '❌'} Uses \\midrule: {has_midrule}")
            print(f" {'✅' if has_bottomrule else '❌'} Uses \\bottomrule: {has_bottomrule}")
            print(f" {'✅' if not has_hline else '❌'} No \\hline: {not has_hline}")
            if not (has_toprule and has_midrule and has_bottomrule) or has_hline:
                style_success = False

    # Step 9: Spot-check feature markers in the generated LaTeX output.
    print("\n9. TESTING LATEX OUTPUT FEATURES")
    print("-" * 40)
    features_found = {
        "Year replacements": any(year_name in latex_output for year_name in ["Year 1", "Year 2", "Year 3"]),
        "Month replacements": any(month_name in latex_output for month_name in ["Jan", "Feb", "Jul"]),
        "Underlined values": "\\underline{" in latex_output,
        "Pattern symbols": any(symbol in latex_output for symbol in ["\\dagger", "\\ddagger", "\\ast"]),
        "Extra columns": "\\checkmark" in latex_output,
        "Proper formatting": "$" in latex_output,
        "Booktabs style": "\\toprule" in latex_output and "\\midrule" in latex_output and "\\bottomrule" in latex_output,
    }
    all_features_work = True
    for feature, found in features_found.items():
        status = "✅" if found else "❌"
        print(f" {status} {feature}: {'Found' if found else 'Not found'}")
        if not found and feature != "Pattern symbols":  # Pattern symbols depend on data
            all_features_work = False

    # Step 10: Backward compatibility with the legacy model-based config keys.
    print("\n10. TESTING BACKWARD COMPATIBILITY")
    print("-" * 40)
    legacy_config = ConfigManager()
    legacy_config._config = {
        'model_order': {'test-model-1': 1, 'test-model-2': 2},
        'latex_model_names': {'test-model-1': 'Test Model 1'},
        'ignored_models': ['debug-model']
    }
    legacy_tests = [
        ("get_sort_order", lambda: legacy_config.get_sort_order('model', 'test-model-1') == 1),
        ("get_value_replacement", lambda: legacy_config.get_value_replacement('model', 'test-model-1') == 'Test Model 1'),
        ("should_exclude_value", lambda: legacy_config.should_exclude_value('model', 'debug-model')),
    ]
    for test_name, test_func in legacy_tests:
        try:
            result = test_func()
            status = "✅" if result else "❌"
            print(f" {status} Legacy {test_name}: {'Passed' if result else 'Failed'}")
            # Fix: a legacy check that returns False (not just one that
            # raises) now fails the overall run.
            if not result:
                all_features_work = False
        except Exception as e:
            print(f" ❌ Legacy {test_name}: Error - {e}")
            all_features_work = False

    # Final summary — every tracked flag must be True for an overall pass.
    print("\n" + "=" * 80)
    print("PIPELINE TEST SUMMARY")
    print("=" * 80)
    if all_features_work and replacement_success and format_success and style_success:
        print("🎉 ALL TESTS PASSED! The complete pipeline is working correctly.")
        print("\n✅ Key features verified:")
        print(" • Generalized row filtering and sorting")
        print(" • Value replacements for all columns")
        print(" • Pattern formatting and custom column formats")
        print(" • LaTeX generation with underlines and symbols")
        print(" • Configurable table styles (hline vs booktabs)")
        print(" • Backward compatibility with legacy configurations")
        print(" • Extra columns and comprehensive formatting")
        return True
    else:
        print("❌ SOME TESTS FAILED! Please review the errors above.")
        return False
def main():
    """Script entry point: run the full pipeline test and exit with a
    conventional status code (0 on success, 1 on failure)."""
    passed = test_complete_pipeline()
    sys.exit(0 if passed else 1)
# Run the test suite only when executed directly as a script.
if __name__ == "__main__":
    main()