diff --git a/.gitignore b/.gitignore index 77f63a50..72d70baf 100755 --- a/.gitignore +++ b/.gitignore @@ -394,3 +394,8 @@ rxivdraft/MANUSCRIPT/2025__antónio_d_brito_et_al__rxiv.pdf .gemini-clipboard/ gha-creds-*.json fatetracking/MANUSCRIPT/.rxiv_cache + +# Claude-mem context (override negations above) +**/CLAUDE.md + +**/*.docx diff --git a/CHANGELOG.md b/CHANGELOG.md index b4ca5f6c..8f146342 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,34 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [1.19.0] - 2026-02-05 + +### Added + +- **bioRxiv Submission Package Command**: New `rxiv biorxiv` command generates complete submission package + - Generates bioRxiv author template (TSV format) with HTML entity encoding for special characters + - Includes manuscript PDF, source files (TeX, figures, bibliography) + - Creates ZIP archive ready for bioRxiv upload + - Supports custom submission directory and ZIP filename options + - HTML entity encoding for accented characters (António → António, Åbo → Åbo) + - Automatic handling of multiple corresponding authors (keeps last one) + - Command options: `--biorxiv-dir`, `--zip-filename`, `--no-zip` + +### Changed + +- **Code Architecture**: Centralized common submission logic in BaseCommand + - Refactored ArxivCommand and BioRxivCommand to share common patterns + - Added `_clear_output_directory()`, `_ensure_pdf_built()`, `_set_submission_defaults()` helper methods + - Eliminated ~64 lines of duplicated code between commands + - Improved maintainability and consistency across submission commands + +### Fixed + +- **bioRxiv Character Encoding**: Special characters now properly encoded as HTML entities + - Previously stripped accents to ASCII (António → Antonio) + - Now preserves original characters using HTML entities (António → António) + - Complies with bioRxiv's TSV import requirements for international author names + ## [1.18.5] - 2026-02-05 ### Fixed diff --git a/src/rxiv_maker/__version__.py b/src/rxiv_maker/__version__.py index 44b8fcf2..acb125c8 100755 --- a/src/rxiv_maker/__version__.py +++ b/src/rxiv_maker/__version__.py @@ -1,3 +1,3 @@ """Version information.""" -__version__ = "1.18.5" +__version__ = "1.19.0" diff --git a/src/rxiv_maker/cli/commands/__init__.py b/src/rxiv_maker/cli/commands/__init__.py index 9ed160c9..dc5fa5d8 100755 --- a/src/rxiv_maker/cli/commands/__init__.py +++ b/src/rxiv_maker/cli/commands/__init__.py @@ -2,6 +2,7 @@ from .arxiv import arxiv from .bibliography import bibliography +from .biorxiv import biorxiv from .build import build as pdf from .cache_management import cache_group as cache from .changelog import changelog @@ -29,6 +30,7 @@ __all__ = [ "arxiv", "bibliography", + "biorxiv", "cache", "changelog", "config", diff --git a/src/rxiv_maker/cli/commands/biorxiv.py b/src/rxiv_maker/cli/commands/biorxiv.py new file mode 100644 index 00000000..f32a5008 --- /dev/null +++ b/src/rxiv_maker/cli/commands/biorxiv.py @@ -0,0 +1,51 @@ +"""bioRxiv submission package generation command.""" + +import rich_click as click + +from ..framework.workflow_commands import BioRxivCommand + + +@click.command(context_settings={"help_option_names": ["-h", "--help"]}) +@click.argument("manuscript_path", type=click.Path(exists=True, file_okay=False), required=False) +@click.option("--output-dir", "-o", default="output", help="Output directory for generated files") +@click.option( + "--biorxiv-dir", + "-b", + help="Custom bioRxiv submission directory (default: output/biorxiv_submission)", +) +@click.option( + "--zip-filename", + "-z", + help="Custom ZIP filename (default: {manuscript}_biorxiv.zip)", +) +@click.option( + "--no-zip", + is_flag=True, + help="Don't create ZIP file (only create submission directory)", +) +@click.pass_context +def biorxiv(ctx, manuscript_path, output_dir, biorxiv_dir, zip_filename, no_zip): + r"""Generate bioRxiv submission package. + + Creates a complete submission package including: + - bioRxiv author template (TSV file) + - Manuscript PDF + - Source files (TeX, figures, bibliography) + - ZIP file for upload + + \b + Example: + rxiv biorxiv # Full package with ZIP + rxiv biorxiv --no-zip # Package without ZIP + rxiv biorxiv -b custom_dir # Custom submission directory + rxiv biorxiv -z my_submission.zip # Custom ZIP filename + """ + command = BioRxivCommand() + return command.run( + ctx, + manuscript_path=manuscript_path, + output_dir=output_dir, + biorxiv_dir=biorxiv_dir, + zip_filename=zip_filename, + no_zip=no_zip, + ) diff --git a/src/rxiv_maker/cli/framework/base.py b/src/rxiv_maker/cli/framework/base.py index 552b3785..313680e1 100644 --- a/src/rxiv_maker/cli/framework/base.py +++ b/src/rxiv_maker/cli/framework/base.py @@ -162,6 +162,97 @@ def error_message(self, message: str, suggestion: Optional[str] = None) -> None: if suggestion: self.console.print(f"💡 {suggestion}", style="yellow") + def _clear_output_directory(self) -> None: + """Clear and recreate the output directory. + + Raises: + CommandExecutionError: If path_manager is not initialized + """ + import shutil + + if not self.path_manager: + raise CommandExecutionError("Path manager not initialized") + + if self.path_manager.output_dir.exists(): + shutil.rmtree(self.path_manager.output_dir) + self.path_manager.output_dir.mkdir(parents=True, exist_ok=True) + + def _ensure_pdf_built(self, progress_task=None, quiet: bool = True) -> None: + """Ensure PDF is built, building it if necessary. + + Args: + progress_task: Optional progress task to update + quiet: Whether to suppress build output + + Raises: + CommandExecutionError: If path_manager is not initialized or build fails + """ + from ...engines.operations.build_manager import BuildManager + + if not self.path_manager: + raise CommandExecutionError("Path manager not initialized") + + pdf_filename = f"{self.path_manager.manuscript_name}.pdf" + pdf_path = self.path_manager.output_dir / pdf_filename + + if not pdf_path.exists(): + if progress_task: + progress_task.update(description="Building PDF first...") + + build_manager = BuildManager( + manuscript_path=str(self.path_manager.manuscript_path), + output_dir=str(self.path_manager.output_dir), + verbose=self.verbose, + quiet=quiet, + ) + + try: + success = build_manager.build() + if not success: + raise CommandExecutionError("PDF build failed") + except Exception as e: + raise CommandExecutionError(f"Failed to build PDF: {e}") from e + + def _set_submission_defaults( + self, + submission_type: str, + submission_dir: Optional[str] = None, + zip_filename: Optional[str] = None, + ) -> tuple[str, str]: + """Set default paths for submission directories and ZIP files. + + Args: + submission_type: Type of submission ("arxiv" or "biorxiv") + submission_dir: Custom submission directory path (optional) + zip_filename: Custom ZIP filename (optional) + + Returns: + Tuple of (submission_dir, zip_filename) with defaults applied + + Raises: + CommandExecutionError: If path_manager is not initialized + """ + from pathlib import Path + + if not self.path_manager: + raise CommandExecutionError("Path manager not initialized") + + manuscript_output_dir = str(self.path_manager.output_dir) + + # Set default submission directory + if submission_dir is None: + submission_dir = str(Path(manuscript_output_dir) / f"{submission_type}_submission") + + # Set default ZIP filename + if zip_filename is None: + manuscript_name = self.path_manager.manuscript_name + if submission_type == "arxiv": + zip_filename = str(Path(manuscript_output_dir) / "for_arxiv.zip") + else: + zip_filename = str(Path(manuscript_output_dir) / f"{manuscript_name}_{submission_type}.zip") + + return submission_dir, zip_filename + @abstractmethod def execute_operation(self, **kwargs) -> Any: """Execute the main command operation. diff --git a/src/rxiv_maker/cli/framework/workflow_commands.py b/src/rxiv_maker/cli/framework/workflow_commands.py index 89c179f1..7858428f 100644 --- a/src/rxiv_maker/cli/framework/workflow_commands.py +++ b/src/rxiv_maker/cli/framework/workflow_commands.py @@ -366,46 +366,25 @@ def execute_operation( no_zip: Don't create zip file """ import sys - from pathlib import Path - from rxiv_maker.engines.operations.build_manager import BuildManager from rxiv_maker.engines.operations.prepare_arxiv import main as prepare_arxiv_main - if self.path_manager is None: - raise CommandExecutionError("Path manager not initialized") - + # Set defaults using shared helper + arxiv_dir, zip_filename = self._set_submission_defaults("arxiv", arxiv_dir, zip_filename) manuscript_output_dir = str(self.path_manager.output_dir) - # Set defaults using PathManager - if arxiv_dir is None: - arxiv_dir = str(Path(manuscript_output_dir) / "arxiv_submission") - if zip_filename is None: - zip_filename = str(Path(manuscript_output_dir) / "for_arxiv.zip") - with self.create_progress() as progress: - # Clear output directory first (similar to PDF command) + # Clear output directory using shared helper task = progress.add_task("Clearing output directory...", total=None) - if self.path_manager.output_dir.exists(): - shutil.rmtree(self.path_manager.output_dir) - self.path_manager.output_dir.mkdir(parents=True, exist_ok=True) + self._clear_output_directory() - # First, ensure PDF is built + # Ensure PDF is built using shared helper progress.update(task, description="Checking PDF exists...") - pdf_filename = f"{self.path_manager.manuscript_name}.pdf" - pdf_path = self.path_manager.output_dir / pdf_filename - - if not pdf_path.exists(): - progress.update(task, description="Building PDF first...") - build_manager = BuildManager( - manuscript_path=str(self.path_manager.manuscript_path), - output_dir=str(self.path_manager.output_dir), - verbose=self.verbose, - quiet=False, - ) - success = build_manager.run() - if not success: - self.error_message("PDF build failed. Cannot prepare arXiv package.") - raise CommandExecutionError("PDF build failed") + try: + self._ensure_pdf_built(progress_task=task, quiet=False) + except CommandExecutionError: + self.error_message("PDF build failed. Cannot prepare arXiv package.") + raise # Prepare arXiv package progress.update(task, description="Preparing arXiv package...") @@ -524,6 +503,95 @@ def _extract_author_and_year(self, config_path: Path) -> tuple[str, str]: return year, first_author +class BioRxivCommand(BaseCommand): + """BioRxiv command implementation for generating submission package.""" + + def execute_operation( + self, + output_dir: str = "output", + biorxiv_dir: Optional[str] = None, + zip_filename: Optional[str] = None, + no_zip: bool = False, + ) -> None: + """Execute bioRxiv submission package preparation. + + Args: + output_dir: Output directory for generated files + biorxiv_dir: Custom bioRxiv submission directory path + zip_filename: Custom zip filename + no_zip: Don't create zip file + """ + from pathlib import Path + + from ...engines.operations.prepare_biorxiv import ( + BioRxivAuthorError, + create_biorxiv_zip, + generate_biorxiv_author_tsv, + prepare_biorxiv_package, + ) + + # Set defaults using shared helper + biorxiv_dir, zip_filename = self._set_submission_defaults("biorxiv", biorxiv_dir, zip_filename) + + with self.create_progress() as progress: + # Clear output directory using shared helper + task = progress.add_task("Clearing output directory...", total=None) + self._clear_output_directory() + + # Ensure PDF is built using shared helper + progress.update(task, description="Checking PDF exists...") + self._ensure_pdf_built(progress_task=task, quiet=True) + + # Generate bioRxiv author template TSV + progress.update(task, description="Generating bioRxiv author template...") + output_path = self.path_manager.output_dir + tsv_file = output_path / "biorxiv_authors.tsv" + + try: + generate_biorxiv_author_tsv( + config_path=self.path_manager.get_config_file_path(), + output_path=tsv_file, + ) + except BioRxivAuthorError as e: + progress.update(task, completed=True) + raise CommandExecutionError(f"Failed to generate bioRxiv template: {e}") from e + + # Prepare bioRxiv submission package + progress.update(task, description="Preparing bioRxiv submission package...") + try: + biorxiv_path = prepare_biorxiv_package( + manuscript_path=self.path_manager.manuscript_path, + output_dir=self.path_manager.output_dir, + biorxiv_dir=Path(biorxiv_dir), + ) + except Exception as e: + progress.update(task, completed=True) + raise CommandExecutionError(f"Failed to prepare bioRxiv package: {e}") from e + + # Create ZIP file if requested + zip_path = None + if not no_zip: + progress.update(task, description="Creating ZIP package...") + try: + zip_path = create_biorxiv_zip( + biorxiv_path=biorxiv_path, + zip_filename=zip_filename, + manuscript_path=self.path_manager.manuscript_path, + ) + except Exception as e: + progress.update(task, completed=True) + raise CommandExecutionError(f"Failed to create ZIP: {e}") from e + + progress.update(task, completed=True) + + # Show success message + self.console.print("\n[green]✅ bioRxiv submission package ready![/green]") + self.console.print(f" 📁 Package directory: {biorxiv_path}") + if zip_path: + self.console.print(f" 📦 ZIP file: {zip_path}") + self.console.print("\n📤 Upload to: https://submit.biorxiv.org/") + + class TrackChangesCommand(BaseCommand): """Track changes command implementation using the framework.""" diff --git a/src/rxiv_maker/cli/main.py b/src/rxiv_maker/cli/main.py index 3163ab60..a8e38174 100755 --- a/src/rxiv_maker/cli/main.py +++ b/src/rxiv_maker/cli/main.py @@ -60,7 +60,7 @@ }, { "name": "Workflow Commands", - "commands": ["get-rxiv-preprint", "arxiv", "track-changes", "setup"], + "commands": ["get-rxiv-preprint", "arxiv", "biorxiv", "track-changes", "setup"], }, { "name": "Configuration", @@ -252,6 +252,7 @@ def main( main.add_command(commands.figures) main.add_command(commands.get_rxiv_preprint, name="get-rxiv-preprint") main.add_command(commands.arxiv) +main.add_command(commands.biorxiv) main.add_command(commands.init) main.add_command(commands.bibliography) main.add_command(commands.track_changes) diff --git a/src/rxiv_maker/engines/operations/prepare_biorxiv.py b/src/rxiv_maker/engines/operations/prepare_biorxiv.py new file mode 100644 index 00000000..76993d2a --- /dev/null +++ b/src/rxiv_maker/engines/operations/prepare_biorxiv.py @@ -0,0 +1,401 @@ +"""Prepare bioRxiv author submission template (TSV format). + +This module generates a tab-separated values (TSV) file containing author information +formatted for bioRxiv submission system upload. +""" + +import csv +import html.entities +import logging +import shutil +import zipfile +from pathlib import Path + +from ...core.managers.config_manager import ConfigManager +from ...utils.author_name_formatter import parse_author_name +from ...utils.email_encoder import decode_email + +logger = logging.getLogger(__name__) + + +def encode_html_entities(text: str) -> str: + """Convert Unicode characters to HTML entities for bioRxiv submission. + + bioRxiv's TSV upload requires special characters to be encoded as HTML entities. + For example, "António" becomes "António", "Åbo" becomes "Åbo". + + Args: + text: Text that may contain Unicode characters + + Returns: + Text with Unicode characters converted to HTML entities + (e.g., "António" -> "António", "Åbo" -> "Åbo") + + Examples: + >>> encode_html_entities("António") + 'António' + >>> encode_html_entities("Åbo") + 'Åbo' + >>> encode_html_entities("José García") + 'José García' + """ + if not text: + return text + + # Build reverse mapping: Unicode character -> HTML entity name + char_to_entity = {} + for entity_name, codepoint in html.entities.name2codepoint.items(): + char = chr(codepoint) + # Skip basic ASCII characters and use named entities for special chars + if ord(char) > 127: + char_to_entity[char] = f"&{entity_name};" + + # Convert each character to HTML entity if it has one + result = [] + for char in text: + if char in char_to_entity: + result.append(char_to_entity[char]) + else: + result.append(char) + + return "".join(result) + + +class BioRxivAuthorError(Exception): + """Exception raised for bioRxiv author template generation errors.""" + + pass + + +def validate_author_data(authors: list[dict]) -> None: + """Validate author data for bioRxiv submission requirements. + + Args: + authors: List of author dictionaries from config + + Raises: + BioRxivAuthorError: If validation fails + """ + if not authors: + raise BioRxivAuthorError("No authors found in configuration") + + # Count corresponding authors + corresponding_count = sum(1 for author in authors if author.get("corresponding_author", False)) + + if corresponding_count == 0: + raise BioRxivAuthorError( + "No corresponding author found. " + "Exactly one author must be marked with 'corresponding_author: true' in 00_CONFIG.yml" + ) + + if corresponding_count > 1: + corresponding_names = [ + author.get("name", "Unknown") for author in authors if author.get("corresponding_author", False) + ] + raise BioRxivAuthorError( + f"Multiple corresponding authors found: {', '.join(corresponding_names)}. " + "Only one author should be marked with 'corresponding_author: true' in 00_CONFIG.yml" + ) + + # Validate each author has a name + for i, author in enumerate(authors): + if not author.get("name"): + raise BioRxivAuthorError(f"Author at index {i} is missing the 'name' field") + + +def format_author_row(author_data: dict, affiliation_map: dict) -> list[str]: + """Format a single author's data as a bioRxiv TSV row. + + Args: + author_data: Author dictionary with processed data + affiliation_map: Dictionary mapping affiliation shortnames to full data + + Returns: + List of column values in bioRxiv order: + Email, Institution, First Name, Middle Name(s)/Initial(s), Last Name, Suffix, + Corresponding Author, Home Page URL, Collaborative Group/Consortium, ORCiD + """ + # Email (decoded from email64) + email = author_data.get("email", "") + + # Institution (first affiliation's full_name) - encode HTML entities for bioRxiv + institution = "" + affiliations = author_data.get("affiliations", []) + if affiliations and affiliations[0] in affiliation_map: + institution = encode_html_entities(affiliation_map[affiliations[0]].get("full_name", "")) + + # Parse name into components and encode HTML entities for bioRxiv + name_str = author_data.get("name", "") + name_parts = parse_author_name(name_str) + + first_name = encode_html_entities(name_parts.get("first", "")) + middle_name = encode_html_entities(name_parts.get("middle", "")) + last_name = encode_html_entities(name_parts.get("last", "")) + suffix = encode_html_entities(name_parts.get("suffix", "")) + + # Corresponding author (any text for Yes, empty string for No) + corresponding = "Yes" if author_data.get("corresponding_author", False) else "" + + # Home Page URL (empty - user preference) + home_page_url = "" + + # Collaborative Group/Consortium (empty) + collaborative_group = "" + + # ORCiD (if present) + orcid = author_data.get("orcid", "") + + return [ + email, + institution, + first_name, + middle_name, + last_name, + suffix, + corresponding, + home_page_url, + collaborative_group, + orcid, + ] + + +def generate_biorxiv_author_tsv(config_path: Path, output_path: Path) -> Path: + """Generate bioRxiv author submission template (TSV format). + + Args: + config_path: Path to the manuscript 00_CONFIG.yml file + output_path: Path where the TSV file should be written + + Returns: + Path to the generated TSV file + + Raises: + BioRxivAuthorError: If author data is invalid or missing + FileNotFoundError: If config file doesn't exist + """ + if not config_path.exists(): + raise FileNotFoundError(f"Configuration file not found: {config_path}") + + # Load manuscript configuration + config_manager = ConfigManager(config_path.parent) + config = config_manager.load_config(config_path) + + # Extract authors and affiliations + authors = config.get("authors", []) + affiliations = config.get("affiliations", []) + + # Handle multiple corresponding authors: keep only the last one + corresponding_indices = [i for i, author in enumerate(authors) if author.get("corresponding_author", False)] + if len(corresponding_indices) > 1: + # Unmark all but the last corresponding author + for idx in corresponding_indices[:-1]: + authors[idx]["corresponding_author"] = False + logger.warning( + f"Multiple corresponding authors found. Only keeping the last one: " + f"{authors[corresponding_indices[-1]].get('name', 'Unknown')}" + ) + + # Validate author data + validate_author_data(authors) + + # Build affiliation map (shortname -> full data) + affiliation_map = {} + for affiliation in affiliations: + shortname = affiliation.get("shortname", "") + if shortname: + affiliation_map[shortname] = affiliation + + # Process authors: decode emails + processed_authors = [] + for author in authors: + author_copy = author.copy() + + # Decode email64 if present + if "email64" in author_copy: + try: + author_copy["email"] = decode_email(author_copy["email64"]) + except ValueError as e: + logger.warning(f"Failed to decode email64 for {author_copy.get('name', 'Unknown')}: {e}") + author_copy["email"] = "" + elif "email" not in author_copy: + author_copy["email"] = "" + + processed_authors.append(author_copy) + + # Generate TSV file + output_path.parent.mkdir(parents=True, exist_ok=True) + + with open(output_path, "w", newline="", encoding="utf-8") as f: + writer = csv.writer(f, delimiter="\t", quoting=csv.QUOTE_MINIMAL, lineterminator="\n") + + # Write header row + header = [ + "Email", + "Institution", + "First Name", + "Middle Name(s)/Initial(s)", + "Last Name", + "Suffix", + "Corresponding Author", + "Home Page URL", + "Collaborative Group/Consortium", + "ORCiD", + ] + writer.writerow(header) + + # Write author rows + for author in processed_authors: + row = format_author_row(author, affiliation_map) + writer.writerow(row) + + logger.info(f"Generated bioRxiv author template: {output_path}") + return output_path + + +def prepare_biorxiv_package( + manuscript_path: Path, + output_dir: Path, + biorxiv_dir: Path | None = None, +) -> Path: + """Prepare bioRxiv submission package. + + Creates a directory containing: + - biorxiv_authors.tsv (author template) + - manuscript PDF + - source files (TeX, figures, bibliography) + + Args: + manuscript_path: Path to the manuscript directory + output_dir: Path to the rxiv-maker output directory + biorxiv_dir: Path where bioRxiv submission files will be created. + If None, defaults to {output_dir}/biorxiv_submission + + Returns: + Path to the bioRxiv submission directory + + Raises: + FileNotFoundError: If required files are missing + """ + output_path = Path(output_dir) + + # Default bioRxiv directory to be inside the output directory + if biorxiv_dir is None: + biorxiv_dir = output_path / "biorxiv_submission" + + biorxiv_path = Path(biorxiv_dir) + + # Create clean bioRxiv directory + if biorxiv_path.exists(): + shutil.rmtree(biorxiv_path) + biorxiv_path.mkdir(parents=True) + + manuscript_name = manuscript_path.name if manuscript_path else "manuscript" + logger.info(f"Preparing bioRxiv submission package for '{manuscript_name}' in {biorxiv_path}") + + # 1. Copy the bioRxiv authors TSV file (already generated) + tsv_source = output_path / "biorxiv_authors.tsv" + if not tsv_source.exists(): + raise FileNotFoundError( + f"bioRxiv author template not found: {tsv_source}\n" + "Please run TSV generation first or ensure output directory is correct." + ) + shutil.copy2(tsv_source, biorxiv_path / "biorxiv_authors.tsv") + logger.info("✓ Copied author template: biorxiv_authors.tsv") + + # 2. Find and copy the manuscript PDF + pdf_files = list(output_path.glob("*.pdf")) + main_pdf = None + for pdf in pdf_files: + # Skip supplementary PDFs + if "supplementary" not in pdf.name.lower(): + main_pdf = pdf + break + + if not main_pdf: + logger.warning("⚠ No manuscript PDF found in output directory") + else: + shutil.copy2(main_pdf, biorxiv_path / main_pdf.name) + logger.info(f"✓ Copied manuscript PDF: {main_pdf.name}") + + # 3. Copy source files for submission + # Copy TeX files + tex_files = list(output_path.glob("*.tex")) + for tex_file in tex_files: + shutil.copy2(tex_file, biorxiv_path / tex_file.name) + logger.info(f"✓ Copied TeX file: {tex_file.name}") + + # Copy style file + style_file = output_path / "rxiv_maker_style.cls" + if style_file.exists(): + shutil.copy2(style_file, biorxiv_path / "rxiv_maker_style.cls") + logger.info("✓ Copied style file: rxiv_maker_style.cls") + + # Copy bibliography + bib_file = output_path / "03_REFERENCES.bib" + if bib_file.exists(): + shutil.copy2(bib_file, biorxiv_path / "03_REFERENCES.bib") + logger.info("✓ Copied bibliography: 03_REFERENCES.bib") + + # Copy FIGURES directory + figures_source = output_path / "FIGURES" + if figures_source.exists() and figures_source.is_dir(): + figures_dest = biorxiv_path / "FIGURES" + shutil.copytree(figures_source, figures_dest) + figure_count = len(list(figures_dest.rglob("*"))) + logger.info(f"✓ Copied FIGURES directory ({figure_count} files)") + + logger.info(f"\n📦 bioRxiv package prepared in {biorxiv_path}") + return biorxiv_path + + +def create_biorxiv_zip( + biorxiv_path: Path, + zip_filename: str = "biorxiv_submission.zip", + manuscript_path: Path | None = None, +) -> Path: + """Create a ZIP file for bioRxiv submission. + + Args: + biorxiv_path: Path to the bioRxiv submission directory + zip_filename: Name of the ZIP file to create + manuscript_path: Optional manuscript path for naming + + Returns: + Path to the created ZIP file + """ + # Use manuscript-aware naming if manuscript path is provided + if manuscript_path and zip_filename == "biorxiv_submission.zip": + manuscript_name = manuscript_path.name + zip_filename = f"{manuscript_name}_biorxiv.zip" + + zip_path = Path(zip_filename).resolve() + + # Define auxiliary files that should be excluded + auxiliary_extensions = {".aux", ".blg", ".log", ".out", ".fls", ".fdb_latexmk", ".synctex.gz"} + + logger.info(f"\n📁 Creating ZIP package: {zip_path}") + + excluded_files = [] + included_files = [] + + with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf: + for file_path in biorxiv_path.rglob("*"): + if file_path.is_file(): + # Check if file should be excluded (auxiliary files) + should_exclude = file_path.suffix.lower() in auxiliary_extensions + + if should_exclude: + excluded_files.append(file_path.name) + continue + + # Store files with relative paths + arcname = file_path.relative_to(biorxiv_path) + zipf.write(file_path, arcname) + included_files.append(str(arcname)) + + logger.info(f"✅ ZIP created: {zip_path}") + logger.info(f" Files included: {len(included_files)}") + if excluded_files: + logger.info(f" Files excluded: {len(excluded_files)} (auxiliary files)") + + return zip_path diff --git a/tests/unit/test_biorxiv_command.py b/tests/unit/test_biorxiv_command.py new file mode 100644 index 00000000..97230391 --- /dev/null +++ b/tests/unit/test_biorxiv_command.py @@ -0,0 +1,17 @@ +"""Unit tests for bioRxiv CLI command.""" + +from click.testing import CliRunner + +from rxiv_maker.cli.commands.biorxiv import biorxiv + + +class TestBioRxivCommand: + """Test the bioRxiv CLI command.""" + + def test_biorxiv_command_help(self): + """Test that help message displays correctly.""" + runner = CliRunner() + result = runner.invoke(biorxiv, ["--help"]) + assert result.exit_code == 0 + assert "bioRxiv submission package" in result.output + assert "author template" in result.output or "TSV file" in result.output diff --git a/tests/unit/test_prepare_biorxiv.py b/tests/unit/test_prepare_biorxiv.py new file mode 100644 index 00000000..60ff7057 --- /dev/null +++ b/tests/unit/test_prepare_biorxiv.py @@ -0,0 +1,369 @@ +"""Unit tests for bioRxiv author template generation.""" + +import csv + +import pytest + +from rxiv_maker.engines.operations.prepare_biorxiv import ( + BioRxivAuthorError, + encode_html_entities, + format_author_row, + generate_biorxiv_author_tsv, + validate_author_data, +) + + +class TestValidateAuthorData: + """Test author data validation.""" + + def test_no_authors(self): + """Test error when no authors provided.""" + with pytest.raises(BioRxivAuthorError, match="No authors found"): + validate_author_data([]) + + def test_no_corresponding_author(self): + """Test error when no corresponding author marked.""" + authors = [ + {"name": "John Smith", "corresponding_author": False}, + {"name": "Jane Doe", "corresponding_author": False}, + ] + with pytest.raises(BioRxivAuthorError, match="No corresponding author found"): + validate_author_data(authors) + + def test_multiple_corresponding_authors(self): + """Test error when multiple corresponding authors marked.""" + authors = [ + {"name": "John Smith", "corresponding_author": True}, + {"name": "Jane Doe", "corresponding_author": True}, + ] + with pytest.raises(BioRxivAuthorError, match="Multiple corresponding authors found"): + validate_author_data(authors) + + def test_valid_single_corresponding_author(self): + """Test validation passes with single corresponding author.""" + authors = [ + {"name": "John Smith", "corresponding_author": False}, + {"name": "Jane Doe", "corresponding_author": True}, + ] + # Should not raise + validate_author_data(authors) + + def test_missing_name(self): + """Test error when author missing name field.""" + authors = [ + {"email": "test@example.com", "corresponding_author": True}, + ] + with pytest.raises(BioRxivAuthorError, match="missing the 'name' field"): + validate_author_data(authors) + + +class TestFormatAuthorRow: + """Test formatting individual author rows.""" + + def test_basic_author_formatting(self): + """Test basic author formatting with all fields.""" + author_data = { + "name": "John A. Smith", + "email": "john@example.com", + "affiliations": ["inst1"], + "corresponding_author": True, + "orcid": "0000-0001-2345-6789", + } + affiliation_map = { + "inst1": {"full_name": "Example University"}, + } + + row = format_author_row(author_data, affiliation_map) + + assert len(row) == 10 + assert row[0] == "john@example.com" # Email + assert row[1] == "Example University" # Institution + assert row[2] == "John" # First name + assert row[3] == "A." # Middle name + assert row[4] == "Smith" # Last name + assert row[5] == "" # Suffix + assert row[6] == "Yes" # Corresponding author + assert row[7] == "" # Home page URL + assert row[8] == "" # Collaborative group + assert row[9] == "0000-0001-2345-6789" # ORCiD + + def test_non_corresponding_author(self): + """Test non-corresponding author formatting.""" + author_data = { + "name": "Jane Doe", + "email": "jane@example.com", + "affiliations": ["inst1"], + "corresponding_author": False, + } + affiliation_map = { + "inst1": {"full_name": "Example University"}, + } + + row = format_author_row(author_data, affiliation_map) + assert row[6] == "" # Corresponding author should be empty string for non-corresponding + + def test_comma_format_name(self): + """Test parsing comma-format names.""" + author_data = { + "name": "Smith, John Alan", + "email": "", + "affiliations": [], + "corresponding_author": False, + } + affiliation_map = {} + + row = format_author_row(author_data, affiliation_map) + assert row[2] == "John" # First name + assert row[3] == "Alan" # Middle name + assert row[4] == "Smith" # Last name + + def test_name_with_suffix(self): + """Test parsing names with suffixes.""" + author_data = { + "name": "Martin, James Jr.", + "email": "", + "affiliations": [], + "corresponding_author": False, + } + affiliation_map = {} + + row = format_author_row(author_data, affiliation_map) + assert row[2] == "James" # First name + assert row[4] == "Martin" # Last name + assert row[5] == "Jr." # Suffix + + def test_no_email(self): + """Test author without email.""" + author_data = { + "name": "John Smith", + "affiliations": [], + "corresponding_author": False, + } + affiliation_map = {} + + row = format_author_row(author_data, affiliation_map) + assert row[0] == "" # Email should be empty string + + def test_no_affiliation(self): + """Test author without affiliation.""" + author_data = { + "name": "John Smith", + "email": "", + "affiliations": [], + "corresponding_author": False, + } + affiliation_map = {} + + row = format_author_row(author_data, affiliation_map) + assert row[1] == "" # Institution should be empty string + + def test_multiple_affiliations_uses_first(self): + """Test that only first affiliation is used.""" + author_data = { + "name": "John Smith", + "email": "", + "affiliations": ["inst1", "inst2"], + "corresponding_author": False, + } + affiliation_map = { + "inst1": {"full_name": "Primary University"}, + "inst2": {"full_name": "Secondary Institute"}, + } + + row = format_author_row(author_data, affiliation_map) + assert row[1] == "Primary University" # Only first affiliation + + +class TestGenerateBiorxivAuthorTsv: + """Test full TSV generation.""" + + def test_tsv_generation_creates_file(self, tmp_path): + """Test that TSV file is created successfully.""" + # Create a minimal config file + config_path = tmp_path / "00_CONFIG.yml" + config_content = """ +authors: + - name: John Smith + email: john@example.com + affiliations: [inst1] + corresponding_author: true + orcid: 0000-0001-2345-6789 + +affiliations: + - shortname: inst1 + full_name: Example University +""" + config_path.write_text(config_content) + + output_path = tmp_path / "output" / "biorxiv_authors.tsv" + + # Generate TSV + result_path = generate_biorxiv_author_tsv(config_path, output_path) + + # Verify file exists + assert result_path.exists() + assert result_path == output_path + + def test_tsv_format_and_content(self, tmp_path): + """Test that TSV has correct format and content.""" + config_path = tmp_path / "00_CONFIG.yml" + config_content = """ +authors: + - name: Smith, John A. + email: john@example.com + affiliations: [inst1] + corresponding_author: true + orcid: 0000-0001-2345-6789 + +affiliations: + - shortname: inst1 + full_name: Example University +""" + config_path.write_text(config_content) + + output_path = tmp_path / "biorxiv_authors.tsv" + generate_biorxiv_author_tsv(config_path, output_path) + + # Read and verify TSV + with open(output_path, newline="", encoding="utf-8") as f: + reader = csv.reader(f, delimiter="\t") + rows = list(reader) + + # Check header + assert len(rows) == 2 # Header + 1 author + header = rows[0] + assert header[0] == "Email" + assert header[2] == "First Name" + assert header[6] == "Corresponding Author" + + # Check author data + author_row = rows[1] + assert author_row[0] == "john@example.com" + assert author_row[1] == "Example University" + assert author_row[2] == "John" + assert author_row[3] == "A." + assert author_row[4] == "Smith" + assert author_row[6] == "Yes" + assert author_row[9] == "0000-0001-2345-6789" + + def test_email64_decoding(self, tmp_path): + """Test that email64 is properly decoded.""" + config_path = tmp_path / "00_CONFIG.yml" + # Base64 for "test@example.com" + config_content = """ +authors: + - name: John Smith + email64: dGVzdEBleGFtcGxlLmNvbQ== + affiliations: [inst1] + corresponding_author: true + +affiliations: + - shortname: inst1 + full_name: Example University +""" + config_path.write_text(config_content) + + output_path = tmp_path / "biorxiv_authors.tsv" + generate_biorxiv_author_tsv(config_path, output_path) + + with open(output_path, newline="", encoding="utf-8") as f: + reader = csv.reader(f, delimiter="\t") + rows = list(reader) + + assert rows[1][0] == "test@example.com" + + def test_config_file_not_found(self, tmp_path): + """Test error when config file doesn't exist.""" + config_path = tmp_path / "nonexistent.yml" + output_path = tmp_path / "output.tsv" + + with pytest.raises(FileNotFoundError): + generate_biorxiv_author_tsv(config_path, output_path) + + def test_multiple_authors(self, tmp_path): + """Test TSV with multiple authors.""" + config_path = tmp_path / "00_CONFIG.yml" + config_content = """ +authors: + - name: John Smith + email: john@example.com + affiliations: [inst1] + corresponding_author: false + - name: Jane Doe + email: jane@example.com + affiliations: [inst2] + corresponding_author: true + +affiliations: + - shortname: inst1 + full_name: University A + - shortname: inst2 + full_name: University B +""" + config_path.write_text(config_content) + + output_path = tmp_path / "biorxiv_authors.tsv" + generate_biorxiv_author_tsv(config_path, output_path) + + with open(output_path, newline="", encoding="utf-8") as f: + reader = csv.reader(f, delimiter="\t") + rows = list(reader) + + assert len(rows) == 3 # Header + 2 authors + assert rows[1][6] == "" # First author not corresponding (empty string) + assert rows[2][6] == "Yes" # Second author is corresponding + + +class TestEncodeHtmlEntities: + """Test HTML entity encoding for special characters.""" + + def test_accented_characters(self): + """Test encoding of common accented characters.""" + # Portuguese/Spanish characters + assert encode_html_entities("António") == "António" + assert encode_html_entities("José") == "José" + assert encode_html_entities("García") == "García" + + def test_nordic_characters(self): + """Test encoding of Nordic characters.""" + assert encode_html_entities("Åbo") == "Åbo" + assert encode_html_entities("Øyvind") == "Øyvind" + + def test_complex_text(self): + """Test encoding of text with multiple special characters.""" + text = "Instituto de Tecnologia Química e Biológica António Xavier" + expected = "Instituto de Tecnologia Química e Biológica António Xavier" + assert encode_html_entities(text) == expected + + def test_no_special_characters(self): + """Test that text without special characters is unchanged.""" + text = "Bruno Saraiva" + assert encode_html_entities(text) == text + + def test_empty_string(self): + """Test that empty string is handled correctly.""" + assert encode_html_entities("") == "" + + def test_none_value(self): + """Test that None is handled correctly.""" + assert encode_html_entities(None) is None + + def test_author_formatting_with_html_entities(self): + """Test that author formatting correctly encodes HTML entities.""" + author_data = { + "name": "António da Silva", + "email": "antonio@example.com", + "affiliations": ["inst1"], + "corresponding_author": False, + } + affiliation_map = { + "inst1": {"full_name": "Universidade de São Paulo"}, + } + + row = format_author_row(author_data, affiliation_map) + + # Check that name components have HTML entities + assert row[2] == "António" # First name + assert row[4] == "da Silva" # Last name (no special chars) + # Check institution has HTML entities + assert "São Paulo" in row[1]