# Source code for mswh.tools.plots

import logging
import os

import numpy as np
import pandas as pd

from plotly.offline import iplot, init_notebook_mode

import plotly.graph_objs as go
import plotly.io as pio

# Module-level logger, named after this module. Its own threshold is set
# to DEBUG at import time, so every record emitted through `log` in this
# file is passed on to whatever handlers are attached.
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)


class Plot(object):
    """Creates and saves plots to visualize and
    correlate arrays, usually timeseries

    Parameters:

        title: str
            Plot title

        label_h: str
            Horizontal axis label

        label_v: str
            Vertical axis label

        data_headers: list
            A list of labels in the same order as the corresponding data
            If None the labels will be the df column labels, or integer
            indices if a list got provided

        save_image: boolean
            If True saves the created image with either
            a given or default path and filename. Supported
            file types are 'png' and 'pdf', as specified in
            the filename.

        legend: boolean
            Plot the legend or not

        outpath: str, '' or None
            Folder to save the image of the plot to. Both '' and
            None stand for the current working directory. The
            folder is created if it does not exist.

        duration_curve: boolean
            If True each column (df or arrays) is sorted in
            descending order before plotting, so that the plot
            shows the duration curve

        boxmode: str
            Passed to the box plot layout as `boxmode`
            (see https://plot.ly/python/reference/#layout-boxmode)

        notebook_mode: boolean
            Plot in the notebook if True

        width: int
            Image width

        height: int
            Image height

        fontsize: int
            Axis label font size. Title, tick and legend fonts
            are scaled from this value.

        legend_x, legend_y: float
            Legend position, passed to the layout legend

        margin_l, margin_b: float
            Left and bottom margin, passed to the layout

    Returns (all plotting methods):

        fig: plotly figure if self.interactive
              else True
    """

    def __init__(
        self,
        title="",
        label_h="Time [h]",
        label_v="Component performance",
        data_headers=None,
        save_image=True,
        legend=True,
        outpath="",
        duration_curve=False,
        boxmode="group",
        notebook_mode=False,
        width=1200,
        height=800,
        fontsize=28,
        legend_x=0.4,
        legend_y=1.0,
        margin_l=200.0,
        margin_b=200.0,
    ):

        self.data_headers = data_headers
        self.save_image = save_image
        self.outpath = outpath
        self.interactive = notebook_mode
        self.duration_curve = duration_curve

        # plot formatting, see
        # https://plot.ly/python/reference/#layout-titlefont
        # NOTE(review): `titlefont` is the legacy plotly spelling; newer
        # plotly releases may expect title=dict(text=..., font=...).
        # Confirm against the plotly version in use before upgrading.
        def axis(label, tick_scale):
            # Axis settings shared by both layouts; only the tick font
            # scale differs between them.
            return dict(
                title=label,
                titlefont=dict(size=fontsize, color="#7f7f7f"),
                tickfont=dict(size=fontsize * tick_scale),
            )

        def common():
            # A fresh dict per layout so the two layouts share no state.
            return dict(
                font=dict(size=fontsize, family="arial"),
                title=title,
                xaxis=axis(label_h, 0.8),
                showlegend=legend,
                width=width,
                height=height,
                margin=dict(l=margin_l, b=margin_b),
            )

        # layout used by the scatter and series plots
        self.layout = go.Layout(
            titlefont=dict(size=fontsize * 1.0, family="arial"),
            yaxis=axis(label_v, 0.6),
            legend=dict(
                x=legend_x,
                y=legend_y,
                font=dict(family="arial", size=fontsize * 0.8),
            ),
            **common()
        )

        # layout used by the box plots
        self.boxlayout = go.Layout(
            titlefont=dict(size=fontsize * 1.2, family="arial"),
            yaxis=axis(label_v, 0.8),
            legend=dict(x=legend_x, y=legend_y),
            boxmode=boxmode,
            **common()
        )

    @staticmethod
    def _as_frame(data):
        """Returns the input data as a new dataframe, one variable per
        column. Lists of unequal length are padded with np.nan. The
        caller's object is never modified.
        """
        if isinstance(data, pd.DataFrame):
            return data.copy()

        if isinstance(data, pd.Series):
            return data.to_frame().copy()

        if isinstance(data, np.ndarray):
            return pd.DataFrame(data)

        if isinstance(data, (list, tuple)):
            nested = any(
                isinstance(item, (list, tuple, np.ndarray, pd.Series))
                for item in data
            )
            # a flat list of values is a single variable
            variables = data if nested else [data]

            columns = dict()
            for position, variable in enumerate(variables):
                values = list(variable)
                if values:
                    columns[position] = pd.Series(values)
                else:
                    # explicit dtype, an empty column is all padding
                    columns[position] = pd.Series(values, dtype=float)

            # series of different lengths get aligned on their
            # integer index, the gaps are filled with np.nan
            return pd.DataFrame(columns)

        msg = (
            "Provide the data as a list, an array, "
            "a pandas series or a pandas dataframe."
        )
        log.error(msg)
        raise TypeError(msg)

    @staticmethod
    def _resolve_modes(modes, num_traces):
        """Returns a list with one plotting mode for each trace."""
        if not isinstance(modes, list):
            return [modes] * num_traces

        if len(modes) < num_traces:
            msg = "Provide one mode for each of the {} traces.".format(
                num_traces
            )
            log.error(msg)
            raise ValueError(msg)

        return modes

    @staticmethod
    def _sort_descending(frame):
        """Sorts each column of the dataframe on its own, in place,
        in a descending order (np.nan values end up last).
        """
        for col_index in range(frame.shape[1]):
            frame.iloc[:, col_index] = (
                frame.iloc[:, col_index].sort_values(ascending=False).values
            )

    def _finish(self, fig, outfile):
        """Saves the figure if self.save_image and shows it if
        self.interactive. Returns the figure if self.interactive,
        else True.
        """
        if self.save_image:
            # both '' and None stand for the current directory;
            # os.makedirs('') would raise
            folder = self.outpath if self.outpath else os.getcwd()
            os.makedirs(folder, exist_ok=True)
            pio.write_image(fig, os.path.join(folder, outfile))

        if self.interactive:
            try:
                iplot(fig)
            except Exception as err:
                msg = "Interactive mode failed."
                log.error(msg)
                raise RuntimeError(msg) from err
            return fig

        return True

    def scatter(self, data, outfile="scatter.png", modes="lines+markers"):
        """Creates a scatter plot

        Parameters:

            data: list of arrays/lists, pd df
                Provide a list of arrays/lists or a pandas dataframe.
                The variables should be ordered in pairs such that
                each odd variable in the list/column in the df
                gets assigned to the horizontal axis, each even
                variable to the vertical axis. Each pair needs
                to have the same length, but pairs can be of
                a different length (shorter ones get padded
                with np.nan). The provided data is not modified.

                If self.duration_curve each column, including the
                ones on the horizontal axis, is sorted on its own
                in a descending order.

            outfile: str
                Filename including the extension ('.png' or '.pdf')

            modes: str or list of str
                'markers', 'lines', 'lines+markers' or
                a list of the above to assign to each plot
                (one string in a list for each pair of data)

        Returns:

            fig: plotly figure if self.interactive
                  else True

        Raises:

            ValueError: if less than two or an odd number of
                variables got provided
        """
        # some input format error handling, not exhaustive
        if isinstance(data, (list, tuple)) and len(data) < 2:
            msg = (
                "Provide at least two arrays or columns to "
                "create a scatter plot. Or try Series plot "
                "for a single column of data versus its index."
            )
            log.error(msg)
            raise ValueError(msg)

        if isinstance(data, pd.Series) or (
            isinstance(data, pd.DataFrame) and data.shape[1] == 1
        ):
            msg = (
                "Provide a dataframe with no less than two "
                "columns. Series plot can plot a single column "
                "against its index."
            )
            log.error(msg)
            raise ValueError(msg)

        frame = self._as_frame(data)

        if self.duration_curve:
            self._sort_descending(frame)

        if self.data_headers:
            frame.columns = self.data_headers

        num_columns = frame.shape[1]

        if num_columns % 2 != 0:
            msg = (
                "Provide an even number of columns, "
                "e.g. [x1, y1, x2, y2, ...]"
            )
            log.error(msg)
            raise ValueError(msg)

        list_modes = self._resolve_modes(modes, num_columns // 2)

        # one trace for each (horizontal, vertical) pair of columns,
        # labeled with the label of the vertical one
        plot_data = [
            go.Scatter(
                x=frame.iloc[:, col_index],
                y=frame.iloc[:, col_index + 1],
                mode=list_modes[col_index // 2],
                name=frame.columns[col_index + 1],
            )
            for col_index in range(0, num_columns, 2)
        ]

        fig = go.Figure(data=plot_data, layout=self.layout)

        return self._finish(fig, outfile)

    def series(
        self,
        data,
        index_in_a_column=None,
        outfile="series.png",
        modes="lines+markers",
    ):
        """Plots all series data against either the index or one of the
        provided columns. It can sort the data and plot the duration_curve.

        Parameters:

            data: array/list, pd series, list of arrays/lists, pd df
                Provide an array, a list or a pandas series if
                plotting a single variable. If plotting multiple
                variables provide a list of arrays/lists or a
                pandas dataframe. The provided data is not modified.

                Horizontal axis corresponds to:

                    * if pd df: the index of the dataframe or the column given by `index_in_a_column`
                    * if list or arrays/lists: an integer range as long as the longest array/list

                Shorter arrays/lists get padded with np.nan.

            index_in_a_column: column label or None
                Horizontal axis labels
                If None, dataframe index is used, otherwise pass a
                column label for a column (it will not be considered
                as a series to plot)

            outfile: str
                Filename including the extension ('.png' or '.pdf')

            modes: str or list of str
                'markers', 'lines', 'lines+markers' or
                a list of the above to assign to each column
                of data, excluding the column given by
                index_in_a_column if it is not None

        Returns:

            fig: plotly figure if self.interactive
                else True
        """
        frame = self._as_frame(data)

        if index_in_a_column is not None:
            labels_h_axis = frame.loc[:, index_in_a_column]
            frame = frame.drop(columns=[index_in_a_column])
        else:
            labels_h_axis = frame.index

        if self.data_headers:
            frame.columns = self.data_headers

        num_columns = frame.shape[1]

        list_modes = self._resolve_modes(modes, num_columns)

        # only the plotted columns get sorted, the horizontal
        # axis labels stay as provided
        if self.duration_curve:
            self._sort_descending(frame)

        plot_data = [
            go.Scatter(
                x=labels_h_axis,
                y=frame.iloc[:, col_index],
                mode=list_modes[col_index],
                name=frame.columns[col_index],
            )
            for col_index in range(num_columns)
        ]

        fig = go.Figure(data=plot_data, layout=self.layout)

        return self._finish(fig, outfile)

    def box(
        self,
        dfs,
        plot_cols=None,
        groupby_cols=None,
        df_cat=None,
        outfile="box.png",
        boxmean=False,
        colors=("#3D9970", "#FF4136", "#FF851B"),
        title="Energy Use",
        boxpoints="outliers",
    ):
        """Creates box plots for the chosen `plot_cols` and can
        group plots by the `groupby_cols`.

        Parameters:

            dfs: list of dfs

            plot_cols: list of columns to plot, one from each df in
                dfs. If multiple dfs are passed, the values will be
                shown as groups on the plot

            groupby_cols: list of cols to use as x axis, from each
                df. Use the same column if it has the same elements.
                Use None if x axis category not used

            df_cat: list of str or None
                Indicator of the category carried by the dfs
                (E.g. the dfs differ by housing type). It gets
                appended to the trace name. Use None, or None
                for a single df, to leave the category out.

            outfile: str
                Filename including the extension ('.png' or '.pdf')

            boxmean: True, False, 'sd'
                See https://plot.ly/python/reference/#box

            colors: list of str
                Marker colors, one for each df. The colors get
                reused from the start if there are more dfs
                than colors.

            title: str
                Currently not used. The plot title is the one
                passed when creating the Plot object.

            boxpoints: False, 'all', 'outliers', 'suspectedoutliers'
                See https://plot.ly/python/reference/#box

        Returns:

            fig: plotly figure if self.interactive
                else True

        Raises:

            ValueError: if `plot_cols` is not provided
        """
        if plot_cols is None:
            msg = "Provide `plot_cols`, one column label for each df."
            log.error(msg)
            raise ValueError(msg)

        data = list()

        for i, df in enumerate(dfs):

            y = df[plot_cols[i]].values.tolist()

            if (groupby_cols is not None) and (groupby_cols[i] is not None):
                x = df[groupby_cols[i]].values.tolist()
            else:
                x = None

            category = df_cat[i] if df_cat is not None else None

            # no dangling separator if the category is left out
            if category:
                name = "{} - {}".format(plot_cols[i], category)
            else:
                name = str(plot_cols[i])

            if colors:
                marker = dict(color=colors[i % len(colors)])
            else:
                marker = None

            data.append(
                go.Box(
                    y=y,
                    x=x,
                    name=name,
                    boxpoints=boxpoints,
                    marker=marker,
                    boxmean=boxmean,
                )
            )

        fig = go.Figure(data=data, layout=self.boxlayout)

        return self._finish(fig, outfile)


# size and color modes:
#   .........
#                mode='markers',
#                marker={'size': sz,
#                        'color': colors,
#                        'opacity': 0.6,
#                        'colorscale': 'Viridis'