Source code for pybe.benchmark

from typing import List, Callable, Dict, Union, Optional
import pandas as pd
from tqdm import tqdm
import multiprocessing as mp


class Benchmark:
    """Benchmark any Python function.

    pybe.Benchmark allows you to:

    * **benchmark** any Python function (with vectors of real numbers as output)

    * **store** the results in a csv (default) or excel file and

    * **read** from previous benchmark results.

    .. epigraph::
        **How it works:**
        Specify a list of inputs and apply a given function to those inputs a specified number of times

    The results are kept as one table (a pandas DataFrame) with one row per
    function call. Each row holds the outputs of that call plus the two
    bookkeeping columns ``Input`` and ``Name``.
    """

    # Bookkeeping columns stored next to the function outputs in every row.
    _META_COLUMNS = ['Input', 'Name']

    def __init__(self,
                 benchmark_csv_file_path: Optional[str] = None,
                 ) -> None:
        """Create a benchmark, optionally loading previous results.

        Parameters
        ----------
        benchmark_csv_file_path : Optional[str]
            path of a csv file written by a previous benchmark; if given, its
            content becomes the result of this instance
        """
        # Start from a well-formed empty table so that ``result``, ``inputs``
        # and ``name_outputs`` work before anything was run or loaded.
        self._result = pd.DataFrame(columns=list(self._META_COLUMNS))
        if benchmark_csv_file_path is not None:
            self.read_from_csv(benchmark_csv_file_path)

    def __call__(self,
                 function: Callable[..., Dict[str, float]],
                 inputs: List[Union[str, float]],
                 name: str,
                 number_runs: int = 10,
                 store: bool = True,
                 parallel: bool = False,
                 ) -> None:
        """Benchmark a function

        The function is called ``number_runs`` times for every element of
        ``inputs``; the collected outputs replace any result previously held
        by this instance.

        Parameters
        ----------
        function : Callable[..., Dict[str, float]]
            function to be benchmarked which returns a dictionary with keys the name of the output (string)
            and value the value of the function (float)

        inputs : List[Union[str, float]]
            inputs on which the function is to be benchmarked stored as a list of strings or floats

        name : str
            name of the benchmark; stored in the ``Name`` column of every row and
            used as file name (``<name>.csv``) when ``store`` is true

        number_runs : int
            number of runs for each input

        store : bool
            if true, store the output of the benchmark as ``<name>.csv``

        parallel : bool
            if true, run the benchmark in parallel (using multiprocessing)

        Raises
        ------
        RuntimeError
            if ``parallel`` is true and the worker pool cannot be created,
            with a hint to call the benchmark under ``if __name__ == '__main__':``

        Examples
        --------
        Initialize the benchmark class.

        >>> benchmark = Benchmark()

        Define the function to be benchmarked. This function must take a single argument (float or string) and return
        a dictionary where each key represents one output

        >>> def test_function(i: int):
        ...     return {"value": i}

        Define the list of inputs

        >>> inputs = [1, 2, 3]

        Specify the number of runs

        >>> number_runs=3

        Run the benchmark

        >>> benchmark(function=test_function,
        ...           name="test-benchmark",
        ...           inputs=inputs,
        ...           number_runs=number_runs)
        """

        self._name = name
        self._inputs = inputs

        # One dictionary per function call, in the same layout for the
        # sequential and the parallel path.
        # TODO: test what if dict contains numpy array or others...
        rows: List[Dict[str, object]] = []
        if parallel:
            # TODO: doesnt work for generating instances of classes
            # Only the pool creation is guarded: a RuntimeError raised by the
            # benchmarked function itself must reach the caller unchanged.
            try:
                pool = mp.Pool()
            except RuntimeError as err:
                raise RuntimeError(
                    "benchmark(test_function) needs to be inside the if __name__ == '__main__': "
                    'clause to prevent spawning infinite processes.') from err
            # A single pool is shared by all inputs instead of one per input.
            with pool:
                for x in tqdm(inputs):
                    outputs = pool.map(function, [x] * number_runs)
                    rows.extend({**output, 'Input': x, 'Name': name} for output in outputs)
        else:
            for x in tqdm(inputs):
                for _ in range(number_runs):
                    # Build a new dict so the object returned by ``function``
                    # is never modified.
                    rows.append({**function(x), 'Input': x, 'Name': name})

        # Keys of the last collected row (including 'Input' and 'Name').
        self._name_outputs = list(rows[-1]) if rows else []
        if rows:
            self._result = pd.DataFrame(rows)
        else:
            # No input or no runs: keep an empty but well-formed table.
            self._result = pd.DataFrame(columns=list(self._META_COLUMNS))

        if store:
            self.to_csv(name)

    def to_excel(self, name: str = 'benchmark') -> None:
        """Save results to excel (xlsx) file

        The file is written to ``<name>.xlsx``; a relative name is resolved
        against the current working directory.

        Parameters
        ----------
        name : str
            name of the benchmark (file name without extension)
        """
        self.result.to_excel(f'{name}.xlsx')

    def to_csv(self, name: str = 'benchmark') -> None:
        """Save results to csv file

        The file is written to ``<name>.csv`` without the row index; a
        relative name is resolved against the current working directory.

        Parameters
        ----------
        name : str
            name of the benchmark (file name without extension)
        """
        self.result.to_csv(f'{name}.csv', index=False)

    @property
    def result(self) -> pd.DataFrame:
        """Return the result of the benchmark as a pandas DataFrame

        Returns
        -------
        pd.DataFrame
            result of the benchmark (empty if nothing was run or read yet)

        """
        return self._result

    def read_from_csv(self, benchmark_csv_file_path: str) -> None:
        """Read previous results from a csv file and store them in this instance

        Parameters
        ----------
        benchmark_csv_file_path : str
            path of benchmark csv file

        Examples
        --------
        >>> benchmark = Benchmark() # initialize benchmark instance
        >>> benchmark.read_from_csv(benchmark_csv_file_path="./benchmark.csv") # read results
        >>> print(benchmark.result) # print result
        """
        self._result = pd.read_csv(benchmark_csv_file_path)

    def return_outputs(self, input: float) -> pd.DataFrame:
        """Return all result rows that belong to one input

        Parameters
        ----------
        input : float
            value to look up in the ``Input`` column

        Returns
        -------
        pd.DataFrame
            rows of the result whose ``Input`` equals ``input``
        """
        return self.result.loc[self.result['Input'] == input]

    @property
    def inputs(self) -> List[Union[str, float]]:
        """Return the list of inputs of the benchmark

        Returns
        -------
        List[Union[str, float]]
            distinct inputs of the benchmark, in order of first appearance

        """
        # ``unique`` keeps the order of first appearance, unlike a set.
        return self.result['Input'].unique().tolist()

    @property
    def name_outputs(self) -> List[str]:
        """Return the list of names of outputs of the benchmark

        Returns
        -------
        List[str]
            name of outputs (all columns except ``Input`` and ``Name``)

        """
        return list(self.result.columns.drop(self._META_COLUMNS))

    @property
    def means(self) -> pd.DataFrame:
        """Return the means of the outputs as pandas DataFrame

        Returns
        -------
        pd.DataFrame
            mean of every numeric column per input, plus the ``Name`` column

        """
        means = self.result.groupby(['Input']).mean(numeric_only=True).reset_index()
        means['Name'] = self.name
        return means

    @property
    def std(self) -> pd.DataFrame:
        """Return the standard deviation of the outputs as pandas DataFrame

        Returns
        -------
        pd.DataFrame
            standard deviation of every numeric column per input, plus the
            ``Name`` column

        """
        std = self.result.groupby(['Input']).std(numeric_only=True).reset_index()
        std['Name'] = self.name
        return std

    @property
    def name(self) -> str:
        """Return the name of the benchmark

        The name is taken from the first row of the ``Name`` column.

        Returns
        -------
        str
            name of the benchmark

        Raises
        ------
        IndexError
            if the result is empty

        """
        return self.result['Name'].values[0]