# Source code for clusx.cli

"""Command-line interface for the Clusterium.

This module provides a command-line interface for clustering text data,
benchmarking clustering results, and generating reports. It handles command-line
arguments, environment configuration, and execution of the appropriate toolkit
functionality based on user commands.

"""

from __future__ import annotations

import os
import sys
from pathlib import Path
from typing import TYPE_CHECKING

import click

if TYPE_CHECKING:
    from collections.abc import Callable
    from typing import Optional

from .errors import ClusterIntegrityError, EvaluationError
from .logging import get_logger, setup_logging
from .version import __copyright__, __version__

logger = get_logger(__name__)

# Set up paths
BASE_DIR = Path(__file__).resolve().parent.parent
OUTPUT_DIR = BASE_DIR / "output"

BANNER = r"""


              ████
             ░░███
      ██████  ░███  █████ ████  █████  █████ █████
     ███░░███ ░███ ░░███ ░███  ███░░  ░░███ ░░███
    ░███ ░░░  ░███  ░███ ░███ ░░█████  ░░░█████░
    ░███  ███ ░███  ░███ ░███  ░░░░███  ███░░░███
    ░░██████  █████ ░░████████ ██████  █████ █████
     ░░░░░░  ░░░░░   ░░░░░░░░ ░░░░░░  ░░░░░ ░░░░░



"""


class RichGroup(click.Group):
    """Custom Click group that displays a banner before the help text."""

    # No ``__init__`` override: the previous one only forwarded its arguments
    # to ``click.Group.__init__`` and added nothing.

    def format_help(self, ctx, formatter):
        """Write the banner followed by the regular group help.

        Click calls this method when the help text is requested. The banner
        is written into ``formatter`` rather than printed directly, so
        building the help string has no side effect on stdout and the banner
        stays attached to the help text wherever it ends up being emitted.

        Args:
            ctx: The active Click context.
            formatter: The Click help formatter collecting the help output.
        """
        formatter.write(BANNER)
        super().format_help(ctx, formatter)
def common_options(func: Callable) -> Callable:
    """Attach the options shared by all clusx CLI commands.

    Currently this adds ``--output-dir``. The directory is deliberately not
    required to exist: Click applies ``exists=True`` to the default value as
    well, which would reject a missing directory before the command had a
    chance to create it (the commands call ``os.makedirs(output_dir,
    exist_ok=True)`` themselves).

    Args:
        func: The command callback to decorate.

    Returns:
        The same callback with the shared options attached.
    """
    output_dir_option = click.option(
        "--output-dir",
        type=click.Path(file_okay=False, dir_okay=True),
        default=str(OUTPUT_DIR),
        help="Directory to save output files (default: ./output)",
    )
    return output_dir_option(func)
@click.group(
    help="Text Clustering Toolkit for Bayesian Nonparametric Analysis",
    cls=RichGroup,
)
@click.version_option(
    version=__version__,
    prog_name="clusx",
    message=f"""%(prog)s %(version)s
{__copyright__}
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.""",
)
def cli():
    """Text Clustering Toolkit for statistical analysis and benchmarking."""


@cli.command(help="Cluster text data using various Bayesian nonparametric methods")
@common_options
@click.option(
    "--input",
    "input_",
    type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True),
    help="Path to the input file (text file or CSV)",
    required=True,
)
@click.option(
    "--output",
    default="clusters_output.csv",
    show_default=True,
    help="CSV file with clustering results",
)
@click.option(
    "--dp-alpha",
    default=0.5,
    show_default=True,
    type=float,
    help=(
        "Concentration parameter for Dirichlet Process (α > 0, "
        "typical values 0.1-10). Note: DP only uses α."
    ),
)
@click.option(
    "--dp-kappa",
    default=0.3,
    show_default=True,
    type=float,
    help="Kappa parameter for Dirichlet likelihood model",
)
@click.option(
    "--pyp-alpha",
    default=0.3,
    show_default=True,
    type=float,
    help=(
        "Concentration parameter for Pitman-Yor Process (α > -σ). "
        "Using same α value as DP leads to dramatically different "
        "clustering behaviors."
    ),
)
@click.option(
    "--pyp-kappa",
    default=0.3,
    show_default=True,
    type=float,
    help="Kappa parameter for Pitman-Yor likelihood model",
)
@click.option(
    "--pyp-sigma",
    default=0.3,
    show_default=True,
    type=float,
    help=(
        "Discount parameter for Pitman-Yor Process (0.0 ≤ σ < 1.0). "
        "PYP uses both α and σ parameters."
    ),
)
@click.option(
    "--random-seed",
    default=None,
    show_default=True,
    type=int,
    help="Random seed for reproducible clustering",
)
@click.option(
    "--column",
    default=None,
    help="Column name to use for clustering (required for CSV files)",
)
def cluster(
    input_: str,
    output: str,
    output_dir: str,
    dp_alpha: float,
    dp_kappa: float,
    pyp_alpha: float,
    pyp_kappa: float,
    pyp_sigma: float,
    random_seed: Optional[int],
    column: Optional[str],
):
    """Cluster text data using Dirichlet Process and Pitman-Yor Process.

    Runs both models on the same texts and writes four files into
    ``output_dir``: ``<stem>_dp.csv``, ``<stem>_pyp.csv``, ``<stem>_dp.json``
    and ``<stem>_pyp.json``, where ``<stem>`` is the base name of ``output``
    without its final extension.

    Args:
        input_: Path to the input file.
        output: File name used as the template for the result files.
        output_dir: Directory that receives the result files; created when
            missing.
        dp_alpha: Concentration parameter passed to the Dirichlet Process.
        dp_kappa: Kappa parameter passed to the Dirichlet Process.
        pyp_alpha: Concentration parameter passed to the Pitman-Yor Process.
        pyp_kappa: Kappa parameter passed to the Pitman-Yor Process.
        pyp_sigma: Discount parameter passed to the Pitman-Yor Process.
        random_seed: Seed forwarded to both models as ``random_state``.
        column: Column to read the texts from, forwarded to ``load_data``.

    Raises:
        SystemExit: With exit code 1 when loading, clustering or saving fails.
    """
    # Imported lazily so the heavy clustering stack is only loaded when the
    # command actually runs.
    from .clustering import (
        DirichletProcess,
        PitmanYorProcess,
    )
    from .clustering.utils import (
        load_data,
        save_clusters_to_csv,
        save_clusters_to_json,
    )

    try:
        os.makedirs(output_dir, exist_ok=True)

        logger.debug(
            "Loading data from %s%s...",
            input_,
            ", using column '" + column + "' " if column else "",
        )
        texts = load_data(input_, column)
        _validate_dataset(texts)

        logger.info("Loaded %d texts for clustering", len(texts))

        logger.info("Performing Dirichlet Process clustering...")
        dp = DirichletProcess(
            alpha=dp_alpha,
            kappa=dp_kappa,
            random_state=random_seed,
        )
        clusters_dp = dp.fit_predict(texts)
        # NOTE(review): DP counts clusters via ``dp.cluster_params`` while PYP
        # below counts distinct labels - confirm both give the same number.
        logger.info(
            "DP clustering complete. Found %d clusters", len(set(dp.cluster_params))
        )

        logger.info("Performing Pitman-Yor Process clustering...")
        pyp = PitmanYorProcess(
            alpha=pyp_alpha,
            kappa=pyp_kappa,
            sigma=pyp_sigma,
            random_state=random_seed,
        )
        clusters_pyp = pyp.fit_predict(texts)
        logger.info(
            "PYP clustering complete. Found %d clusters", len(set(clusters_pyp))
        )

        # Save results. File names are derived from the stem of ``output`` so
        # that the four files stay distinct even when ``output`` does not end
        # in ".csv".
        dp_output = _result_path(output_dir, output, "_dp.csv")
        pyp_output = _result_path(output_dir, output, "_pyp.csv")
        save_clusters_to_csv(
            dp_output,
            texts,
            clusters_dp,
            "DP",
            alpha=dp_alpha,
            sigma=0.0,
            kappa=dp_kappa,
        )
        save_clusters_to_csv(
            pyp_output,
            texts,
            clusters_pyp,
            "PYP",
            alpha=pyp_alpha,
            sigma=pyp_sigma,
            kappa=pyp_kappa,
        )

        dp_json = _result_path(output_dir, output, "_dp.json")
        pyp_json = _result_path(output_dir, output, "_pyp.json")
        save_clusters_to_json(
            dp_json,
            texts,
            clusters_dp,
            "DP",
            alpha=dp_alpha,
            sigma=0.0,
            kappa=dp_kappa,
        )
        save_clusters_to_json(
            pyp_json,
            texts,
            clusters_pyp,
            "PYP",
            alpha=pyp_alpha,
            sigma=pyp_sigma,
            kappa=pyp_kappa,
        )
    except click.ClickException as err:
        # Expected, user-facing failure (e.g. empty dataset): no traceback.
        logger.error(err)
        sys.exit(1)
    except Exception as err:  # pylint: disable=broad-except
        logger.exception(err)  # Unexpected error
        sys.exit(1)


@cli.command(help="Evaluate and compare clustering results")
@common_options
@click.option(
    "--input",
    "input_",
    type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True),
    help="Path to the input file (text file or CSV)",
    required=True,
)
@click.option(
    "--dp-clusters",
    type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True),
    help="Path to Dirichlet Process clustering results CSV",
    required=True,
)
@click.option(
    "--pyp-clusters",
    type=click.Path(exists=True, file_okay=True, dir_okay=False, readable=True),
    help="Path to Pitman-Yor Process clustering results CSV",
    required=True,
)
@click.option(
    "--plot/--no-plot",
    default=True,
    show_default=True,
    help="Generate evaluation plots",
)
@click.option(
    "--show-plot/--no-show-plot",
    default=False,
    show_default=True,
    help="Display plots interactively (not recommended for automated runs)",
)
@click.option(
    "--random-seed",
    default=None,
    show_default=True,
    type=int,
    help="Random seed for reproducible evaluation",
)
@click.option(
    "--column",
    default=None,
    help="Column name to use for clustering (required for CSV files)",
)
def evaluate(
    input_: str,
    dp_clusters: str,
    pyp_clusters: str,
    output_dir: str,
    plot: bool,
    show_plot: bool,
    random_seed: Optional[int],
    column: Optional[str],
):
    """Evaluate clustering results using established metrics.

    Loads the texts and both sets of cluster assignments, builds one report
    per model, saves the reports to ``output_dir`` and optionally renders the
    evaluation dashboard.

    Args:
        input_: Path to the input file.
        dp_clusters: Path to the Dirichlet Process results CSV.
        pyp_clusters: Path to the Pitman-Yor Process results CSV.
        output_dir: Directory that receives the report (and plots); created
            when missing.
        plot: Whether to generate the evaluation dashboard.
        show_plot: Whether to display the dashboard interactively.
        random_seed: Seed forwarded to the evaluators as ``random_state``.
        column: Column to read the texts from, forwarded to ``load_data``.

    Raises:
        SystemExit: With exit code 1 when loading or evaluation fails.
    """
    # Imported lazily so the evaluation stack is only loaded when needed.
    from .clustering.utils import (
        get_embeddings,
        load_cluster_assignments,
        load_data,
    )
    from .evaluation import (
        ClusterEvaluator,
        save_evaluation_report,
    )

    try:
        os.makedirs(output_dir, exist_ok=True)

        logger.debug(
            "Loading data from %s%s...",
            input_,
            ", using column '" + column + "' " if column else "",
        )
        texts = load_data(input_, column)
        _validate_dataset(texts)

        logger.info("Loaded %d texts for evaluation", len(texts))

        # Load cluster assignments
        logger.debug("Loading DP cluster assignments from %s...", dp_clusters)
        dp_cluster_assignments, dp_params = load_cluster_assignments(dp_clusters)

        logger.debug("Loading PYP cluster assignments from %s...", pyp_clusters)
        pyp_cluster_assignments, pyp_params = load_cluster_assignments(pyp_clusters)

        embeddings = get_embeddings(texts)

        # Evaluate DP clusters
        logger.info("Evaluating Dirichlet Process clustering...")
        dp_evaluator = ClusterEvaluator(
            texts,
            embeddings,
            dp_cluster_assignments,
            "Dirichlet",
            alpha=dp_params["alpha"],
            sigma=dp_params["sigma"],
            kappa=dp_params["kappa"],
            random_state=random_seed,
        )
        dp_report = dp_evaluator.generate_report()

        # Evaluate PYP clusters
        logger.info("Evaluating Pitman-Yor Process clustering...")
        pyp_evaluator = ClusterEvaluator(
            texts,
            embeddings,
            pyp_cluster_assignments,
            "Pitman-Yor",
            alpha=pyp_params["alpha"],
            sigma=pyp_params["sigma"],
            kappa=pyp_params["kappa"],
            random_state=random_seed,
        )
        pyp_report = pyp_evaluator.generate_report()

        reports = {
            "Dirichlet": dp_report,
            "Pitman-Yor": pyp_report,
        }
        save_evaluation_report(reports, output_dir)

        if plot:
            from .visualization import visualize_evaluation_dashboard

            logger.info("Generating evaluation dashboard...")
            visualize_evaluation_dashboard(reports, output_dir, show_plot=show_plot)
            if show_plot:
                logger.info("Close the plot window to continue.")

        logger.info("Evaluation complete.")
    except (ClusterIntegrityError, EvaluationError, click.ClickException) as error:
        # Expected, user-facing failures: report the message without traceback.
        logger.error(error)
        sys.exit(1)
    except Exception as error:  # pylint: disable=broad-except
        logger.exception(error)  # Unexpected error
        sys.exit(1)


def _result_path(output_dir: str, output: str, suffix: str) -> str:
    """Build a result file path inside ``output_dir``.

    Only the base name of ``output`` is used, and its final extension
    (whatever it is) is replaced by ``suffix``. For ``clusters_output.csv``
    and ``_dp.csv`` this yields ``clusters_output_dp.csv``.

    Args:
        output_dir: Directory that will contain the file.
        output: User-supplied output file name or path.
        suffix: Ending to append to the stem, e.g. ``"_dp.csv"``.

    Returns:
        The joined path as a string.
    """
    stem = Path(os.path.basename(output)).stem
    return os.path.join(output_dir, f"{stem}{suffix}")


def _validate_dataset(texts) -> None:
    """Validate the input dataset and warn when it is too small.

    An empty dataset is an error. A dataset of one or two texts triggers a
    red "Critical" notice; a dataset of fewer than ten texts triggers a
    yellow "Warning" notice. The most severe case is tested first, because
    any dataset with at most two texts also has fewer than ten.

    Args:
        texts: List of texts to be clustered

    Raises:
        click.ClickException: If ``texts`` is empty.
    """
    if not texts:
        raise click.ClickException("No data found in the provided source.")

    size = len(texts)
    if size <= 2:
        click.echo(
            click.style(
                "Critical: Dataset has only 1-2 texts. "
                "Most evaluation metrics require at least 3 texts. "
                "Consider using a larger dataset for meaningful evaluation.",
                fg="red",
                bold=True,
            )
        )
    elif size < 10:
        click.echo(
            click.style(
                "Warning: Dataset is very small (< 10 texts). "
                "Some evaluation metrics and visualizations may not be available "
                "or meaningful.",
                fg="yellow",
                bold=True,
            )
        )
def main(args: Optional[list[str]] = None) -> int:
    """
    Main entry point for the clusx CLI.

    Args:
        args: Command line arguments (uses :py:data:`sys.argv` if None)

    Returns:
        int: Exit code (0 for success, non-zero for errors)
    """
    # Set up logging
    setup_logging()

    try:
        # Invoke the Click command without letting Click call sys.exit()
        result = cli.main(args=args, standalone_mode=False)
    except click.exceptions.Abort:
        # Handle keyboard interrupts gracefully
        logger.warning("Operation aborted by user")
        return 130  # Standard exit code for SIGINT
    except click.exceptions.Exit as e:
        # Click versions that propagate Exit in non-standalone mode
        return e.exit_code
    except Exception as exc:  # pylint: disable=broad-exception-caught
        # Handle unexpected errors
        logger.error(exc)
        return 1

    # In non-standalone mode, current Click turns ``ctx.exit(code)`` into the
    # return value of ``main()`` instead of raising; pass that code on rather
    # than always reporting success. The commands themselves return None.
    return result if isinstance(result, int) else 0