# Source code for anomaly.adtk

import pandas as pd
import matplotlib.pyplot as plt
from adtk.detector import InterQuartileRangeAD



# [docs]
class AnomalyDetector:
    """
    Detect anomalies in one column of a time series and plot them.

    The detector works on a private copy of the input frame, indexed by the
    date column. By default ``detect()`` uses ADTK's ``InterQuartileRangeAD``
    constructed with no arguments.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame containing the time series data. It is copied,
        so the caller's frame is never modified.
    date_col : str, default "ORDERDATE"
        The column name containing date/time information. Unless the index
        is already named ``date_col``, this column is set as the index of
        the internal copy.
    value_col : str, default "PROFIT_MARGIN"
        The column on which anomaly detection is performed.

    Attributes
    ----------
    df : pd.DataFrame
        The internal copy of the data, indexed by ``date_col``.
    anomalies : pd.DataFrame or None
        Result of the most recent ``detect()`` call; ``None`` until then.

    Notes
    -----
    No dtype conversion is applied to the date column.
    NOTE(review): ADTK presumably requires a ``DatetimeIndex`` -- confirm and
    convert upstream if the column holds strings.
    """

    # Name of the flag column in the result frame. The misspelling is kept
    # deliberately: it is part of the schema returned to existing callers.
    _FLAG_COL = "is_anamoly"
    # Name of the column carrying the original observations in the result.
    _VALUE_COL = "value"

    def __init__(
        self,
        df: pd.DataFrame,
        date_col: str = "ORDERDATE",
        value_col: str = "PROFIT_MARGIN",
    ):
        self.date_col = date_col
        self.value_col = value_col

        # Work on a copy so as not to modify the original DataFrame.
        working = df.copy()
        # Only re-index when the frame is not already indexed by the date
        # column; a missing column surfaces as pandas' own KeyError.
        if working.index.name != date_col:
            working = working.set_index(date_col)
        self.df = working

        self.anomalies = None

    def detect(self, detector=None) -> pd.DataFrame:
        """
        Detect anomalies in the configured value column.

        Parameters
        ----------
        detector : object, optional
            Any object exposing ``fit_detect(frame)`` that, given a
            single-column DataFrame, returns a DataFrame with a flag column
            of the same name. Defaults to a fresh ``InterQuartileRangeAD()``.

        Returns
        -------
        anomalies : pd.DataFrame
            A DataFrame (also stored on ``self.anomalies``) that includes:
              - 'is_anamoly': the detector's flag for each row.
              - 'value': the original value from the input DataFrame.
        """
        if detector is None:
            detector = InterQuartileRangeAD()

        # Pass a one-column frame so the detector output is a frame keyed by
        # the same column name.
        observed = self.df[[self.value_col]]
        detected = detector.fit_detect(observed)

        # Name the flag column *before* attaching the observations. Doing it
        # the other way round clobbers the flags when value_col == "value".
        flags = detected[self.value_col]
        anomalies = flags.rename(self._FLAG_COL).to_frame()
        anomalies[self._VALUE_COL] = observed[self.value_col]

        self.anomalies = anomalies
        return anomalies

    def _flagged_rows(self) -> pd.DataFrame:
        """Return the rows of ``self.anomalies`` flagged as anomalous.

        Raises
        ------
        ValueError
            If ``detect()`` has not been called yet.
        """
        if self.anomalies is None:
            raise ValueError(
                "Anomalies have not been detected yet. Please call detect() first."
            )
        # ``.eq(True)`` yields a strict boolean mask even if the flag column
        # holds missing values, which plain boolean indexing would reject.
        # NOTE(review): ADTK may emit NaN flags for NaN inputs -- confirm.
        is_flagged = self.anomalies[self._FLAG_COL].eq(True)
        return self.anomalies[is_flagged]

    def get_anomaly_dates(self) -> pd.Series:
        """
        Retrieve the dates where anomalies were detected.

        Returns
        -------
        anomaly_dates : pd.Series
            The index values (dates) of the flagged rows, as a Series with a
            default integer index, named after the DataFrame's index.

        Raises
        ------
        ValueError
            If ``detect()`` has not been called yet.
        """
        flagged_index = self._flagged_rows().index
        return pd.Series(flagged_index, name=flagged_index.name)

    def visualize(
        self,
        figsize=(12, 6),
        title="Daily Profit Margin Over Time with Anomalies Highlighted",
        xlabel="Date",
        ylabel="Profit Margin Value",
        ylim=(40, 60),
    ):
        """
        Visualize the time series data with anomalies highlighted.

        Draws the observed values as a black line, overlays the flagged
        points as red markers, and shows the figure with ``plt.show()``.

        Parameters
        ----------
        figsize : tuple, default (12, 6)
            Size of the figure.
        title : str, default "Daily Profit Margin Over Time with Anomalies Highlighted"
            Title for the plot.
        xlabel : str, default "Date"
            Label for the x-axis.
        ylabel : str, default "Profit Margin Value"
            Label for the y-axis.
        ylim : tuple or None, default (40, 60)
            Y-axis limits. Pass ``None`` to keep matplotlib's automatic
            limits.

        Raises
        ------
        ValueError
            If ``detect()`` has not been called yet.
        """
        # Resolve the flagged rows first so the "not detected" error is
        # raised before any figure is created.
        anomaly_points = self._flagged_rows()

        plt.figure(figsize=figsize)

        # Plot the original time series.
        plt.plot(
            self.anomalies.index,
            self.anomalies[self._VALUE_COL],
            label="Profit Margin",
            color="black",
        )

        # Highlight the anomaly points.
        plt.scatter(
            anomaly_points.index,
            anomaly_points[self._VALUE_COL],
            color="red",
            label="Anomaly Detected",
            s=100,
        )

        # Set plot labels and grid.
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.legend()
        plt.grid(True, linestyle="--", linewidth=0.5, alpha=0.4)
        if ylim is not None:
            plt.ylim(ylim)

        plt.show()