Skip to content

semlib.sort

Algorithm

Bases: ABC

Abstract base class for sorting algorithms.

Sorting algorithms can be used with sort.

Source code in src/semlib/sort/algorithm/algorithm.py
class Algorithm(ABC):
    """Abstract base class for sorting algorithms.

    Sorting algorithms can be used with [sort][semlib.sort.Sort.sort]."""

    def __init__(self) -> None:
        """Initialize."""

    @abstractmethod
    async def _sort[T](
        self,
        iterable: Iterable[T],
        /,
        *,
        reverse: bool = False,
        comparator: Callable[[T, T], Coroutine[None, None, Order]],
        max_concurrency: int,
    ) -> list[T]: ...

__init__

__init__() -> None

Initialize.

Source code in src/semlib/sort/algorithm/algorithm.py
def __init__(self) -> None:
    """Initialize."""

QuickSort

Bases: Algorithm

Quicksort sorting algorithm.

This algorithm uses the Quicksort method to sort items. This algorithm does not provide theoretical guarantees with noisy pairwise comparisons, but standard sorting algorithms can perform well in practice (Qin et al., 2024) even with noisy pariwise comparisons using LLMs.

This algorithm requires O(n log n) pairwise comparisons on average. If you want higher-quality rankings and can tolerate increased costs and latency, you can consider using the BordaCount algorithm instead.

Source code in src/semlib/sort/algorithm/quicksort.py
class QuickSort(Algorithm):
    """Quicksort sorting algorithm.

    This algorithm uses the [Quicksort](https://en.wikipedia.org/wiki/Quicksort) method to sort items. This algorithm
    does **not** provide theoretical guarantees with noisy pairwise comparisons, but standard sorting algorithms can
    perform well in practice ([Qin et al., 2024](https://aclanthology.org/2024.findings-naacl.97/)) even with noisy
    pariwise comparisons using LLMs.

    This algorithm requires O(n log n) pairwise comparisons on average. If you want higher-quality rankings and can
    tolerate increased costs and latency, you can consider using the [BordaCount][semlib.sort.algorithm.BordaCount]
    algorithm instead.
    """

    @override
    def __init__(self, *, randomized: bool = False):
        """Initialize.

        Args:
            randomized: If `True`, uses a randomized pivot selection strategy. This can help avoid worst-case O(n^2)
                performance on certain inputs, but results may be non-deterministic. If False, always uses the first
                item as the pivot.
        """
        super().__init__()
        self._randomized = randomized

    @override
    async def _sort[T](
        self,
        iterable: Iterable[T],
        /,
        *,
        reverse: bool = False,
        comparator: Callable[[T, T], Coroutine[None, None, Order]],
        max_concurrency: int,
    ) -> list[T]:
        lst = iterable if isinstance(iterable, list) else list(iterable)

        async def quicksort(lst: list[T]) -> list[T]:
            if len(lst) <= 1:
                return lst
            pivot_index = secrets.randbelow(len(lst)) if self._randomized else 0
            pivot = lst[pivot_index]
            less = []
            greater = []
            equal = [pivot]  # to handle "neither" case
            comparisons = await util.gather(
                *(comparator(item, pivot) for i, item in enumerate(lst) if i != pivot_index)
            )
            for i, item in enumerate(lst):
                if i == pivot_index:
                    continue
                comparison = comparisons[i if i < pivot_index else i - 1]
                if comparison == Order.LESS:
                    less.append(item)
                elif comparison == Order.GREATER:
                    greater.append(item)
                else:
                    equal.append(item)
            sort_less, sort_greater = await util.gather(quicksort(less), quicksort(greater))
            return sort_less + equal + sort_greater

        sort_list = await quicksort(lst)
        return sort_list[::-1] if reverse else sort_list

__init__

__init__(*, randomized: bool = False)

Initialize.

Parameters:

Name Type Description Default
randomized bool

If True, uses a randomized pivot selection strategy. This can help avoid worst-case O(n^2) performance on certain inputs, but results may be non-deterministic. If False, always uses the first item as the pivot.

False
Source code in src/semlib/sort/algorithm/quicksort.py
@override
def __init__(self, *, randomized: bool = False):
    """Initialize.

    Args:
        randomized: If `True`, uses a randomized pivot selection strategy. This can help avoid worst-case O(n^2)
            performance on certain inputs, but results may be non-deterministic. If False, always uses the first
            item as the pivot.
    """
    super().__init__()
    self._randomized = randomized

BordaCount

Bases: Algorithm

Borda count sorting algorithm.

This algorithm uses the Borda count method to rank items. The algorithm has good theoretical properties for finding approximate rankings based on noisy pairwise comparisons (Shah and Wainwright, 2018).

This algorithm requires O(n^2) pairwise comparisons. If you want to reduce the number of comparisons (to reduce LLM costs), you can consider using the QuickSort algorithm instead.

This algorithm is carefully implemented so that it has O(n) space complexity.

Source code in src/semlib/sort/algorithm/borda_count.py
class BordaCount(Algorithm):
    """Borda count sorting algorithm.

    This algorithm uses the [Borda count](https://en.wikipedia.org/wiki/Borda_count) method to rank items.  The
    algorithm has good theoretical properties for finding approximate rankings based on noisy pairwise comparisons
    ([Shah and Wainwright, 2018](https://jmlr.org/papers/volume18/16-206/16-206.pdf)).

    This algorithm requires O(n^2) pairwise comparisons. If you want to reduce the number of comparisons (to reduce LLM
    costs), you can consider using the [QuickSort][semlib.sort.algorithm.QuickSort] algorithm instead.

    This algorithm is carefully implemented so that it has O(n) space complexity.
    """

    @override
    async def _sort[T](
        self,
        iterable: Iterable[T],
        /,
        *,
        reverse: bool = False,
        comparator: Callable[[T, T], Coroutine[None, None, Order]],
        max_concurrency: int,
    ) -> list[T]:
        lst = iterable if isinstance(iterable, list) else list(iterable)
        scores: list[int] = [0 for _ in lst]

        async def fn(item: tuple[int, int]) -> None:
            i, j = item
            result_ij, result_ji = await util.gather(comparator(lst[i], lst[j]), comparator(lst[j], lst[i]))
            if result_ij == Order.LESS and result_ji == Order.GREATER:
                scores[j] += 1
                scores[i] -= 1
            elif result_ij == Order.GREATER and result_ji == Order.LESS:
                scores[i] += 1
                scores[j] -= 1

        await foreach(
            fn,
            ((i, j) for i in range(len(lst)) for j in range(i + 1, len(lst))),
            max_concurrency=max(1, max_concurrency // 2),  # because each worker does two comparisons concurrently
        )

        # stable sort
        sort_by_score = sorted([(scores[i], i, lst[i]) for i in range(len(lst))], reverse=reverse)
        return [item for _, _, item in sort_by_score]

__init__

__init__() -> None

Initialize.

Source code in src/semlib/sort/algorithm/algorithm.py
def __init__(self) -> None:
    """Initialize."""

sort_sync

sort_sync[T](
    iterable: Iterable[T],
    /,
    *,
    by: str | None = None,
    to_str: Callable[[T], str] | None = None,
    template: str | Callable[[T, T], str] | None = None,
    task: Task | str | None = None,
    algorithm: Algorithm | None = None,
    reverse: bool = False,
    model: str | None = None,
    max_concurrency: int | None = None,
) -> list[T]

Standalone synchronous version of sort.

Source code in src/semlib/sort/sort.py
def sort_sync[T](
    iterable: Iterable[T],
    /,
    *,
    by: str | None = None,
    to_str: Callable[[T], str] | None = None,
    template: str | Callable[[T, T], str] | None = None,
    task: Task | str | None = None,
    algorithm: Algorithm | None = None,
    reverse: bool = False,
    model: str | None = None,
    max_concurrency: int | None = None,
) -> list[T]:
    """Standalone synchronous version of [sort][semlib.sort.Sort.sort]."""
    sorter = Sort(
        model=model,
        max_concurrency=max_concurrency,
    )
    return asyncio.run(
        sorter.sort(iterable, by=by, to_str=to_str, template=template, task=task, algorithm=algorithm, reverse=reverse)
    )

:::