# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "transformers>=5.0.0rc1",
#     "mlx-audio==0.3.0rc1",
#     "click",
#     "numpy",
#     "soundfile",
# ]
# ///
import sys
from pathlib import Path

import click
import numpy as np
import soundfile as sf
from mlx_audio.tts.utils import load_model


def get_unique_filename(base_path: Path) -> Path:
    """Pick a path that does not exist yet, starting from *base_path*.

    When *base_path* is free it is returned unchanged. Otherwise sibling
    names of the form ``<stem>-2<suffix>``, ``<stem>-3<suffix>``, ... are
    probed in order and the first one not present on disk is returned.
    """
    candidate = base_path
    index = 1
    # Numbering starts at 2 because the unnumbered name counts as the first.
    while candidate.exists():
        index += 1
        candidate = base_path.parent / f"{base_path.stem}-{index}{base_path.suffix}"
    return candidate


class CustomHelpCommand(click.Command):
    """Click command whose help output ends with an "Examples" section."""

    def format_help(self, ctx, formatter):
        """Write the regular help, then append example invocations."""
        super().format_help(ctx, formatter)
        name = ctx.info_name
        # Each entry is emitted with its own write_text call, in this order.
        example_lines = (
            f'{name} "say this text out loud"',
            f'{name} -o saved.wav "hello world"',
            f'{name} -l Chinese "你好世界"',
            f'{name} -i "deep low voice" "hello"',
            f'echo "piped text" | {name}',
        )
        with formatter.section("Examples"):
            formatter.write_paragraph()
            for line in example_lines:
                formatter.write_text(line)


@click.command(cls=CustomHelpCommand)
@click.argument("text", required=False)
@click.option("-o", "--output", default="output.wav", help="Output filename (default: output.wav)")
@click.option("-l", "--language", default="English", help="Language for TTS (default: English)")
@click.option("-i", "--instruct", default=None, help="Voice instruction (e.g., 'deep low voice')")
@click.option("-v", "--verbose", is_flag=True, help="Enable verbose output")
def main(text: str | None, output: str, language: str, instruct: str | None, verbose: bool):
    """Generate audio using Qwen3-TTS and MLX Audio."""
    # NOTE: the docstring above is what click prints as the --help description,
    # so the detailed documentation lives in comments instead.
    #
    # text:     text to speak; read from stdin when omitted and stdin is piped.
    # output:   destination WAV path. Only the implicit default is
    #           auto-incremented; an explicitly requested path is used as given.
    # language: language name forwarded to the model.
    # instruct: optional voice description; forwarded as "" when not given.
    # verbose:  print progress messages and forward verbose=True to the model.
    #
    # Raises click.UsageError for missing/blank text and click.ClickException
    # when the model yields no audio.

    # Fall back to piped stdin when no TEXT argument was supplied.
    if text is None:
        if sys.stdin.isatty():
            raise click.UsageError("No text provided. Pass text as an argument or pipe it via stdin.")
        text = sys.stdin.read().strip()

    # Reject whitespace-only text too, so argument and piped input agree.
    if not text.strip():
        raise click.UsageError("Text cannot be empty.")

    # Determine output path. Ask click where the value came from rather than
    # comparing against the default string: an explicit "-o output.wav" must
    # not be silently redirected to output-2.wav.
    output_path = Path(output)
    output_source = click.get_current_context().get_parameter_source("output")
    if output_source == click.core.ParameterSource.DEFAULT:
        output_path = get_unique_filename(output_path)

    if verbose:
        click.echo("Loading model...")
    model = load_model("Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign")

    if verbose:
        click.echo(f"Generating audio for: {text[:50]}{'...' if len(text) > 50 else ''}")

    # Build generation kwargs
    gen_kwargs = {
        "text": text,
        "language": language,
        "verbose": verbose,
        "instruct": instruct or "",
    }

    # Generate with voice description
    results = list(model.generate_voice_design(**gen_kwargs))
    if not results:
        # Surface a readable CLI error instead of an IndexError traceback.
        raise click.ClickException("The model did not produce any audio.")

    audio = results[0].audio

    # Save to file
    sf.write(str(output_path), np.array(audio), model.sample_rate)
    if verbose:
        click.echo(f"Audio saved to: {output_path}")


# Run the click command only when this file is executed as a script.
if __name__ == "__main__":
    main()