# Source code for votekit.cvr_loaders.load_ranking_csv

import pandas as pd
from typing import Optional, Union
import os
from pathlib import Path
import numpy as np
from votekit.pref_profile import RankProfile
import warnings


def __validate_df_contains_column_idxs(
    df: pd.DataFrame, col_idxs: list[int] | int, *, error_label: str
):
    """
    Validate that the dataframe contains the provided column indices.

    Args:
        df (pd.DataFrame): Dataframe to check.
        col_idxs (list[int] | int): The column indices to check. Can be a list or singleton.
        error_label (str): The type of column being validated.

    Raises:
        ValueError: If the column index is not in the dataframe.

    """
    if not isinstance(col_idxs, list):
        col_idxs = [col_idxs]

    if any(idx < 0 or idx > len(df.columns) - 1 for idx in col_idxs):
        for idx in col_idxs:
            if idx < 0 or idx > len(df.columns) - 1:
                raise ValueError(
                    f"{error_label} column index {idx} must be in [0, {len(df.columns) -1}] "
                    "because Python is 0-indexed."
                )


def __validate_weight_col_values(df: pd.DataFrame, weight_col: int):
    """
    Validate that the weight column has no nan values and all values can be cast to float.

    Args:
        df (pd.DataFrame): The dataframe to validate.
        weight_col (int): The label of the weight column in ``df``.

    Raises:
        ValueError: If the weight column contains a nan value. The first offending row
            is reported.
        ValueError: If the weight column contains a value that cannot be cast to float.
            The first offending row is reported.

    """
    weights = df[weight_col]

    # Locate missing entries through the pandas mask rather than calling np.isnan on
    # each cell: np.isnan raises TypeError on strings, which are exactly what a column
    # holds when it mixes text with blanks.
    missing = weights.isna()
    if missing.any():
        first_missing_row = missing[missing].index[0]
        raise ValueError(f"No weight provided in row {first_missing_row}.")

    try:
        weights.astype(float)
    except (ValueError, TypeError):
        # The bulk cast failed; walk the column to name the first bad value.
        for idx, weight in weights.items():
            try:
                float(weight)
            except (ValueError, TypeError):
                raise ValueError(
                    f"Weight {weight} in row {idx} must be able to be cast to float."
                ) from None


def __validate_distinct_cols(
    rank_cols: list[int], id_col: Optional[int], weight_col: Optional[int]
):
    """
    Ensure neither the ID column nor the weight column doubles as a ranking column.

    Args:
        rank_cols (list[int]): The list of ranking column indices.
        id_col (Optional[int]): The index of the Voter ID column, or None if absent.
        weight_col (Optional[int]): The index of the Weight column, or None if absent.

    Raises:
        ValueError: If the id column index is in rank_cols (checked first).
        ValueError: If the weight column index is in rank_cols.

    """
    # The ID column is checked before the weight column.
    labelled_cols = (("ID", id_col), ("Weight", weight_col))

    for label, col in labelled_cols:
        if col is None:
            continue
        if col in rank_cols:
            raise ValueError(
                f"{label} column {col} must not be a ranking column {rank_cols}."
            )


def __validate_columns(
    df: pd.DataFrame,
    rank_cols: list[int],
    id_col: Optional[int],
    weight_col: Optional[int],
) -> tuple[list[int], Optional[int], Optional[int]]:
    """
    Validate that the columns of the csv are correctly formatted.

    Args:
        df (pd.DataFrame): The dataframe of the csv.
        rank_cols (list[int]): The list of ranking column indices.
        id_col (Optional[int]): The index of the Voter ID column.
        weight_col (Optional[int]): The index of the Weight column.

    Raises:
        ValueError: If both the weight and id column are provided.
        ValueError: If the id column index is in rank_cols.
        ValueError: If the weight column index is in rank_cols.
        ValueError: If a column index is not in the dataframe.
        ValueError: If the weight column contains a nan value.
        ValueError: If the weight column contains a value that cannot be cast to float.

    Returns:
        tuple[list[int], Optional[int], Optional[int]]: ``rank_cols``, ``id_col`` and
            ``weight_col``, unchanged.

    """

    # Compare against None explicitly: 0 is a valid column index but is falsy, so a
    # truthiness test would let weight_col=0 (or id_col=0) slip past this check.
    if weight_col is not None and id_col is not None:
        raise ValueError(
            "Only one of weight_col and id_col can be provided; you cannot have an ID"
            " column if the weight of each ballot is anything but 1."
        )

    __validate_distinct_cols(rank_cols, id_col, weight_col)

    __validate_df_contains_column_idxs(df, rank_cols, error_label="Ranking")

    if id_col is not None:
        __validate_df_contains_column_idxs(df, id_col, error_label="ID")

    if weight_col is not None:
        __validate_df_contains_column_idxs(df, weight_col, error_label="Weight")
        __validate_weight_col_values(df, weight_col)

    return rank_cols, id_col, weight_col


def __format_ranking_cols(
    mutated_df: pd.DataFrame, rank_cols: list[str]
) -> pd.DataFrame:
    """
    Convert every ranking cell into a frozenset.

    Within each row, a string cell becomes a one-element frozenset holding that
    string. A non-string cell becomes an empty frozenset when some later cell in the
    row is a string, and ``frozenset({"~"})`` otherwise.

    Args:
        mutated_df (pd.DataFrame): The dataframe of the csv; modified in place.
        rank_cols (list[str]): The list of ranking column names.

    Returns:
        pd.DataFrame: The same dataframe, with the ranking columns rewritten.

    """
    # Cells must be of object dtype to be able to hold frozensets.
    mutated_df[rank_cols] = mutated_df[rank_cols].astype(object)

    def _to_frozensets(ballot: pd.Series) -> pd.Series:
        entries = ballot.to_list()
        # Position of the last string entry in the row; -1 when there is none.
        last_named = max(
            (pos for pos, entry in enumerate(entries) if isinstance(entry, str)),
            default=-1,
        )

        cells: list[frozenset[str]] = []
        for pos, entry in enumerate(entries):
            if isinstance(entry, str):
                cells.append(frozenset({entry}))
            elif pos < last_named:
                # A string appears later in the row: empty set.
                cells.append(frozenset())
            else:
                # Nothing but non-strings from here on: explicit tilde marker.
                cells.append(frozenset({"~"}))
        return pd.Series(cells, index=ballot.index)

    mutated_df[rank_cols] = mutated_df[rank_cols].apply(_to_frozensets, axis=1)
    return mutated_df


def __format_df(
    mutated_df: pd.DataFrame,
    rank_cols: list[int],
    id_col: Optional[int],
    weight_col: Optional[int],
) -> tuple[pd.DataFrame, list[str]]:
    """
    Formats the column names and datatypes.

    The ranking columns are renamed ``Ranking_1``, ``Ranking_2``, ... following the
    order of ``rank_cols``. A ``Weight`` column (float) and a ``Voter Set`` column
    (sets) are renamed from the given columns or created with defaults, and every
    other column is dropped. The caller's dataframe is not modified.

    Args:
        mutated_df (pd.DataFrame): The dataframe of the csv.
        rank_cols (list[int]): The list of ranking column indices.
        id_col (Optional[int]): The index of the ID column. If None, every row gets an
            empty voter set.
        weight_col (Optional[int]): The index of the weight column. If None, every row
            gets weight 1.

    Returns:
        tuple[pd.DataFrame, list[str]]: The formatted dataframe and the new ranking
            column names.

    """
    mutated_df = mutated_df.copy()
    renamed_columns = {r_col: f"Ranking_{i+1}" for i, r_col in enumerate(rank_cols)}

    if weight_col is not None:
        renamed_columns[weight_col] = "Weight"
    else:
        mutated_df["Weight"] = 1

    if id_col is not None:
        renamed_columns[id_col] = "Voter Set"
    else:
        mutated_df["Voter Set"] = [set() for _ in range(len(mutated_df))]

    mutated_df = mutated_df.rename(columns=renamed_columns)
    mutated_df.index.name = "Ballot Index"

    # NOTE: the names are collected in the dataframe's column order, which is not
    # necessarily ascending rank order when rank_cols is not increasing.
    str_rank_cols = [c for c in mutated_df.columns if str(c).startswith("Ranking_")]

    # Take an explicit copy of the selection: the columns are assigned to below, and
    # assigning into a bare column selection triggers pandas' SettingWithCopyWarning
    # when copy-on-write is not enabled.
    mutated_df = mutated_df[str_rank_cols + ["Voter Set", "Weight"]].copy()
    mutated_df = __format_ranking_cols(mutated_df, str_rank_cols)

    mutated_df["Weight"] = mutated_df["Weight"].astype(float)

    # Voter ids read from the csv are scalars; wrap each one in a singleton set.
    mutated_df["Voter Set"] = mutated_df["Voter Set"].map(
        lambda x: {x} if not isinstance(x, set) else x
    )
    return mutated_df, str_rank_cols


def __find_and_validate_cands(
    df: pd.DataFrame, rank_cols: list[str], candidates: Optional[list[str]]
) -> list[str]:
    """
    Finds the candidates in the csv and validates that they match the provided list.

    Args:
        df (pd.DataFrame): The dataframe of the csv. Each cell of the ranking columns
            must be an iterable of candidate names (e.g. a frozenset).
        rank_cols (list[str]): The list of ranking column labels.
        candidates (Optional[list[str]]): The list of candidates. If None, the
            candidates are inferred from the ranking columns.

    Raises:
        ValueError: If a provided candidate does not appear in the csv. The first such
            candidate, in the order provided, is reported.
        ValueError: If the csv contains a candidate that was not provided. The first
            such candidate, in sorted order, is reported.

    Returns:
        list[str]: The provided candidate list if one was given, otherwise the
            candidates found in the csv, sorted so the result is reproducible.

    """
    cells = df[rank_cols].to_numpy().ravel()
    candidates_found: set[str] = set().union(*cells)

    # "~" is a placeholder for an unfilled ranking position, not a candidate.
    candidates_found.discard("~")

    if candidates is None:
        # Set iteration order for strings varies between interpreter runs; sort so the
        # inferred candidate list is deterministic.
        return sorted(candidates_found)

    for c in candidates:
        if c not in candidates_found:
            raise ValueError(
                f"Candidate {c} was provided in candidates {candidates} but "
                "not found in the csv."
            )

    # Sorted for the same reason as above: report the same candidate on every run.
    for c in sorted(candidates_found):
        if c not in candidates:
            raise ValueError(
                f"Candidate {c} was found in the csv but not provided in "
                f"candidates {candidates}."
            )

    return candidates



# [docs]
def load_ranking_csv(
    path_or_url: Union[str, os.PathLike, Path],
    rank_cols: list[int],
    *,
    weight_col: Optional[int] = None,
    id_col: Optional[int] = None,
    candidates: Optional[list[str]] = None,
    delimiter: str = ",",
    header_row: Optional[int] = None,
    print_profile: bool = True,
) -> RankProfile:
    """
    Load a ranked cast vote record (cvr) from a file path or url. Each row of the csv is
    a voter (or a weighted ballot) and each ranking column is one rank position.

    Args:
        path_or_url (Union[str, os.PathLike, pathlib.Path]): Path or url to cvr file.
        rank_cols (list[int]): Column indices holding the rankings, listed from top rank
            to bottom rank. Column indexing starts from 0.
        weight_col (Optional[int]): Column index of the ballot weights. Defaults to None,
            in which case every row has weight 1. Cannot be combined with ``id_col``.
        id_col (Optional[int]): Column index of the voter ids. Defaults to None.
            Cannot be combined with ``weight_col``.
        candidates (Optional[list[str]]): List of candidate names. Defaults to None, in
            which case names are inferred from the CVR.
        delimiter (str): The character that separates entries. Defaults to a comma.
        header_row (Optional[int]): The row containing the column names, below which the
            data begins. Defaults to None, in which case row 0 is considered to be the
            first ballot.
        print_profile (bool): Whether or not to print the loaded profile. Defaults to
            True. Useful for debugging.


    Raises:
        FileNotFoundError: CSV cannot be found. Raised by ``pandas.read_csv``.
        URLError: Invalid url. Raised by ``pandas.read_csv``.
        HTTPError: URL is valid but other failure occurs. Raised by ``pandas.read_csv``.
        ParserError: Pandas fails to read the csv. Raised by ``pandas.read_csv``.
        UnicodeDecodeError: Bad encoding. Raised by ``pandas.read_csv``.
        ValueError: Candidates provided but they do not exist in the CSV.
        ValueError: Candidates provided but extra candidates are found in the CSV.
        ValueError: Only one of weight_col or id_col can be provided.
        ValueError: weight_col or id_col are not distinct from rank_cols.
        ValueError: weight_col, id_col, and each entry of rank_cols must be non-negative
            and within the number of columns of the csv.
        ValueError: If weight_col is provided, weights must be non-empty and convertible
            to float.
        ValueError: Header must be non-negative.


    Returns:
        RankProfile: A ``RankProfile`` that represents all the ballots in the csv.
    """
    source = str(path_or_url)

    if header_row is not None and header_row < 0:
        raise ValueError(f"Header row {header_row} must be non-negative.")

    read_options = {
        "on_bad_lines": "error",
        "encoding": "utf8",
        "index_col": False,
        "delimiter": delimiter,
        "header": header_row,
    }
    raw_df = pd.read_csv(source, **read_options)

    # Discard any header labels: columns are addressed by position from here on.
    raw_df.columns = pd.Index(range(raw_df.shape[1]))

    rank_cols, id_col, weight_col = __validate_columns(
        raw_df, rank_cols, id_col, weight_col
    )

    ballots_df, ranking_labels = __format_df(raw_df, rank_cols, id_col, weight_col)

    candidate_names = __find_and_validate_cands(
        ballots_df, ranking_labels, candidates
    )

    profile = RankProfile(
        df=ballots_df,
        max_ranking_length=len(ranking_labels),
        candidates=tuple(candidate_names),
    )

    if print_profile:
        print(profile)

    return profile




# [docs]
def load_csv(
    path_or_url: Union[str, os.PathLike, Path],
    rank_cols: list[int],
    *,
    weight_col: Optional[int] = None,
    id_col: Optional[int] = None,
    candidates: Optional[list[str]] = None,
    delimiter: str = ",",
    header_row: Optional[int] = None,
    print_profile: bool = True,
) -> RankProfile:
    """
    Deprecated alias of ``load_ranking_csv``.

    Emits a ``DeprecationWarning`` and then forwards every argument unchanged to
    ``load_ranking_csv``: given a file path or url, loads ranked cast vote record (cvr)
    with ranks as columns and voters as rows.

    Args:
        path_or_url (Union[str, os.PathLike, pathlib.Path]): Path or url to cvr file.
        rank_cols (list[int]): List of column indices that contain rankings. Column indexing
            starts from 0, in order from top to bottom rank.
        weight_col (Optional[int]): The column position for ballot weights. Defaults to None, which
            implies each row has weight 1. Cannot be provided if ``id_col`` is also provided.
        id_col (Optional[int]): Index for the column with voter ids. Defaults to None.
            Cannot be provided if ``weight_col`` is also provided.
        candidates (Optional[list[str]]): List of candidate names. Defaults to None, in which case
            names are inferred from the CVR.
        delimiter (str): The character that separates entries. Defaults to a comma.
        header_row (Optional[int]): The row containing the column names, below which the data
            begins. Defaults to None, in which case row 0 is considered to be the first ballot.
        print_profile (bool): Whether or not to print the loaded profile. Defaults to True. Useful
            for debugging.


    Raises:
        FileNotFoundError: CSV cannot be found. Raised by ``pandas.read_csv``.
        URLError: Invalid url. Raised by ``pandas.read_csv``.
        HTTPError: URL is valid but other failure occurs. Raised by ``pandas.read_csv``.
        ParserError: Pandas fails to read the csv. Raised by ``pandas.read_csv``.
        UnicodeDecodeError: Bad encoding. Raised by ``pandas.read_csv``.
        ValueError: Candidates provided but they do not exist in the CSV.
        ValueError: Candidates provided but extra candidates are found in the CSV.
        ValueError: Only one of weight_col or id_col can be provided.
        ValueError: weight_col or id_col are not distinct from rank_cols.
        ValueError: weight_col, id_col, and each entry of rank_cols must be non-negative and
            within the number of columns of the csv.
        ValueError: If weight_col is provided, weights must be non-empty and convertible to
            float.
        ValueError: Header must be non-negative.


    Returns:
        RankProfile: A ``RankProfile`` that represents all the ballots in the csv.
    """
    # stacklevel=2 attributes the warning to the caller of load_csv. Python's default
    # filters only display DeprecationWarning when it is attributed to __main__, so a
    # warning attributed to this module would normally never be shown.
    warnings.warn(
        "This function is being deprecated in March 2026. The correct function call is "
        "now load_ranking_csv.",
        DeprecationWarning,
        stacklevel=2,
    )

    return load_ranking_csv(
        path_or_url,
        rank_cols,
        weight_col=weight_col,
        id_col=id_col,
        candidates=candidates,
        delimiter=delimiter,
        header_row=header_row,
        print_profile=print_profile,
    )