/*
 * linux/kernel/irq/timings.c
 *
 * Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 */
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/static_key.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/idr.h>
#include <linux/irq.h>
#include <linux/math64.h>

#include <trace/events/irq.h>

#include "internals.h"

/*
 * Static key telling whether the irq timings are in use. It is
 * defined off and flipped by irq_timings_enable() and
 * irq_timings_disable().
 */
DEFINE_STATIC_KEY_FALSE(irq_timing_enabled);

/*
 * Per cpu circular buffer of encoded <timestamp, irq> values. Its
 * content is consumed by irq_timings_next_event() when browsing it
 * with for_each_irqts().
 */
DEFINE_PER_CPU(struct irq_timings, irq_timings);

/*
 * Statistics of one interrupt on one cpu. They are maintained by
 * irqs_update() and read by irq_timings_next_event(). A reset of the
 * statistics zeroes the whole structure except the last timestamp.
 */
struct irqt_stat {
	u64 ne;         /* predicted next event: last timestamp + avg      */
	u64 lts;	/* last timestamp seen for this interrupt          */
	u64 variance;	/* variance accumulator, it is shifted right by
			 * IRQ_TIMINGS_SHIFT when used as the variance     */
	u32 avg;	/* running mean of the intervals                   */
	u32 count;      /* number of intervals injected since the reset    */
	int anomalies;  /* number of consecutive anomalies                 */
	int valid;      /* non zero when 'ne' can be used as a prediction,
			 * cleared once the predicted event is in the past */
};

/*
 * Maps an interrupt number to its per cpu 'struct irqt_stat'. The
 * entries are inserted by irq_timings_alloc() and removed by
 * irq_timings_free().
 */
static DEFINE_IDR(irqt_stats);

/**
 * irq_timings_enable - switch the irq timings static key on
 *
 * Enables the 'irq_timing_enabled' static branch.
 */
void irq_timings_enable(void)
{
	static_branch_enable(&irq_timing_enabled);
}

/**
 * irq_timings_disable - switch the irq timings static key off
 *
 * Disables the 'irq_timing_enabled' static branch.
 */
void irq_timings_disable(void)
{
	static_branch_disable(&irq_timing_enabled);
}

/*
 * for_each_irqts - browse and consume the timings circular buffer
 *
 * @__iter: an integer lvalue receiving the index of the current element
 * @__irqts: a pointer to the 'struct irq_timings' to browse
 *
 * Number of elements in the circular buffer: if the buffer was
 * flushed before and did not wrap since, the number of elements is
 * smaller than IRQ_TIMINGS_SIZE, so the count is used and the index
 * begins at zero. Otherwise the buffer wrapped, the array size is
 * used and the index begins at 'count & IRQ_TIMINGS_MASK', which is
 * the oldest element. That could be done in a nicer way with a proper
 * circular array structure type but at the cost of extra computation
 * in the interrupt handler hot path. We choose efficiency.
 *
 * The elements are visited from the oldest to the newest and the
 * count is decremented at each iteration because the data are
 * consumed from the circular buffer: when the loop runs to completion
 * the count is zero. Leaving the loop earlier leaves the count at the
 * number of elements not yet visited.
 *
 * Both arguments are evaluated several times, they must not have side
 * effects. They are parenthesized in the expansion so that any
 * expression, eg. '&timings', can be passed.
 */
#define for_each_irqts(__iter, __irqts)					\
	for ((__iter) = (__irqts)->count < IRQ_TIMINGS_SIZE ?		\
		     0 : (__irqts)->count & IRQ_TIMINGS_MASK,		\
		     (__irqts)->count = min(IRQ_TIMINGS_SIZE,		\
					    (__irqts)->count);		\
	     (__irqts)->count > 0; (__irqts)->count--,			\
		     (__iter) = ((__iter) + 1) & IRQ_TIMINGS_MASK)

/**
 * irqs_update - inject a new timestamp in the statistics of an interrupt
 *
 * @irqs: the statistics of the interrupt the timestamp belongs to
 * @ts: the timestamp of the new occurrence of the interrupt
 *
 * ** This function must be called with the local irq disabled **
 *
 * The statistics are computed online: the mean and the variance are
 * updated with each new value of the stream instead of doing several
 * passes over a set of values. The integer arithmetic loses some
 * precision compared to a floating point computation, but the error
 * is acceptable as we are dealing with nanoseconds, so with big
 * numbers, and the fractional part is negligible, especially when
 * the result is converted to microseconds afterwards.
 *
 * The computation happens at idle time. While the CPU is busy the
 * timestamps of the interrupts are stored in the circular buffer.
 * When the CPU goes idle, all the values of the buffer are injected
 * one by one through this routine, extending the statistics of the
 * previous busy-idle cycles.
 *
 * The observations showed a device triggers a burst of periodic
 * interrupts followed by one or two peaks of longer time, for
 * instance when a SD card device flushes its cache, then the
 * periodic intervals occur again. The purpose of the algorithm is to
 * watch the periodic interrupts and to cope with the peaks.
 *
 * An interval of one second or more resets the statistics. Hence the
 * intervals taken into account are always below 1x10^9 nsec, which
 * keeps the square of the difference with the mean in the range of a
 * signed 64 bits value.
 *
 * An interrupt is considered periodically stable if the intervals of
 * its occurrences follow the normal distribution, thus if the values
 * comply with:
 *
 *      avg - 3 x stddev < value < avg + 3 x stddev
 *
 * Which can be simplified to:
 *
 *      -3 x stddev < value - avg < 3 x stddev
 *
 *      abs(value - avg) < 3 x stddev
 *
 * In order to save a costly square root computation, the variance is
 * used instead. As stddev = sqrt(variance), the inequality becomes:
 *
 *      abs(value - avg) < 3 x sqrt(variance)
 *
 * And once squared:
 *
 *      (value - avg) x (value - avg) < 9 x variance
 *
 * A value which does not comply with this inequality is an anomaly.
 * The check only applies when at least 30 samples were injected (it
 * is the rule of thumb in statistics for the normal distribution to
 * apply, cf. "30 samples" on Internet).
 *
 * The anomalies are counted as long as they are consecutive. The
 * statistics are reset when an anomaly occurs while three consecutive
 * ones were already counted. An anomaly which does not lead to a
 * reset is nevertheless injected in the mean and the variance like
 * any other value.
 */
static void irqs_update(struct irqt_stat *irqs, u64 ts)
{
	u64 prev_ts = irqs->lts;
	/*
	 * The timestamps are absolute time values, the statistics are
	 * about the interval between two of them. The interval is an
	 * u64 in order to stick to a single type in the computation,
	 * which keeps us away from overflow, sign and division
	 * surprises.
	 */
	u64 interval = ts - prev_ts;
	u64 variance = 0;
	s64 diff;

	irqs->lts = ts;

	/*
	 * The previous occurrence is one second old or more, the
	 * sequence is no longer predictable for our purpose. Consider
	 * this timestamp as the first one of a new sequence: nothing
	 * can be predicted from a single timestamp.
	 *
	 * The very first timestamp of an interrupt always ends up
	 * here because the previous timestamp is zero. That is what
	 * we want as a second timestamp is needed for an interval.
	 */
	if (interval >= NSEC_PER_SEC)
		goto restart;

	/*
	 * The distance to the current mean is needed several times
	 * below, compute it once.
	 */
	diff = interval - irqs->avg;

	/*
	 * One more sample.
	 */
	irqs->count++;

	/*
	 * The online variance is the accumulator divided by the number
	 * of elements when there is more than one sample. The formula
	 * normally divides by count - 1, but we assume there are more
	 * than 32 elements and dividing by 32 instead of 31 is precise
	 * enough.
	 */
	if (likely(irqs->count > 1))
		variance = irqs->variance >> IRQ_TIMINGS_SHIFT;

	if ((irqs->count < 30) || ((diff * diff) <= (9 * variance))) {
		/*
		 * Either the model does not apply yet, or the value
		 * is inside the interval. The anomalies must be
		 * consecutive, so the counter restarts from zero.
		 */
		irqs->anomalies = 0;
	} else if (irqs->anomalies++ >= 3) {
		/*
		 * One anomaly on top of three consecutive ones, the
		 * interrupt is no longer stable enough.
		 */
		goto restart;
	}

	/*
	 * The interrupt is considered stable enough to try to predict
	 * its next event.
	 */
	irqs->valid = 1;

	/*
	 * Online average algorithm:
	 *
	 *  new_average = average + ((value - average) / count)
	 *
	 * It has to be done before the variance, which depends on
	 * the new average.
	 */
	irqs->avg += diff >> IRQ_TIMINGS_SHIFT;

	/*
	 * Online variance algorithm:
	 *
	 *  new_variance = variance + (value - average) x (value - new_average)
	 *
	 * Warning: the average was updated just above, so
	 * 'interval - irqs->avg' is no longer equal to 'diff'.
	 */
	irqs->variance += diff * (interval - irqs->avg);

	/*
	 * The next event is expected one mean interval after this
	 * one.
	 */
	irqs->ne = ts + irqs->avg;

	return;

restart:
	/*
	 * Forget everything but the timestamp, which becomes the
	 * beginning of the next sequence.
	 */
	memset(irqs, 0, sizeof(*irqs));
	irqs->lts = ts;
}

/**
 * irq_timings_rate - Returns the mean interval between the interrupts
 *
 * @now: the local cpu time in nsec
 *
 * Computes a rough average of the interval between the interrupts
 * recorded in the timings circular buffer of the current cpu: the
 * time elapsed between the oldest timing still in the buffer and
 * @now, divided by the number of timings stored in the buffer.
 *
 * The oldest element and the number of elements are found the same
 * way for_each_irqts() does: index zero and 'count' elements when the
 * buffer did not wrap, index 'count & IRQ_TIMINGS_MASK' and
 * IRQ_TIMINGS_SIZE elements otherwise. The buffer is only read, the
 * timings are not consumed.
 *
 * The returned value is unsigned, it does not tell whether the rate
 * is increasing or decreasing. @now is expected not to be older than
 * the oldest timestamp of the buffer, the subtraction wraps otherwise.
 *
 * Returns the mean interval in nsec, zero if the buffer is empty.
 */
u64 irq_timings_rate(u64 now)
{
	struct irq_timings *irqts = this_cpu_ptr(&irq_timings);
	int oldest, not_used;
	u64 ts;

	if (!irqts->count)
		return 0;

	/*
	 * When the buffer did not wrap, the oldest timing is the
	 * first element of the array and the slot at 'count' was not
	 * written yet. When it wrapped, the slot at 'count' modulo
	 * the array size is the next one to be overwritten, hence the
	 * oldest one.
	 */
	oldest = irqts->count < IRQ_TIMINGS_SIZE ?
		0 : irqts->count & IRQ_TIMINGS_MASK;

	irq_timing_decode(irqts->values[oldest], &ts, &not_used);

	/*
	 * No more than IRQ_TIMINGS_SIZE timings are kept, whatever
	 * the number of interrupts which occurred since the last
	 * flush.
	 */
	return div_u64(now - ts, min(IRQ_TIMINGS_SIZE, irqts->count));
}

/**
 * irq_timings_next_event - Return when the next event is supposed to arrive
 *
 * @now: the local cpu time in nsec
 *
 * *** This function must be called with the local irq disabled ***
 *
 * During the last busy cycle, the number of interrupts is incremented
 * and stored in the irq_timings structure. This information is
 * necessary to:
 *
 * - know if the index in the table wrapped up:
 *
 *      If more interrupts than the array size happened during the
 *      last busy/idle cycle, the index wrapped up and we have to
 *      begin with the next element in the array, which is the oldest
 *      one of the sequence, otherwise the oldest one is at the
 *      index 0.
 *
 * - have an indication of the interrupts activity on this CPU
 *   (eg. irq/sec)
 *
 * The values are 'consumed' when they are injected in the
 * statistical model, thus the count is zero when the function
 * returns.
 *
 * The array of values **must** be browsed in the time direction, the
 * timestamp must increase between an element and the next one.
 *
 * Once the statistics are up to date, the earliest next event among
 * the interrupts with valid statistics is looked for. If one of them
 * is already in the past, @now is returned and the statistics of this
 * interrupt are invalidated until new events occur.
 *
 * The tracepoint is only emitted for the interrupt the returned value
 * comes from, and only when its number is not zero.
 *
 * Returns a nanosec time based estimation of the earliest interrupt,
 * U64_MAX otherwise.
 */
u64 irq_timings_next_event(u64 now)
{
	struct irq_timings *irqts = this_cpu_ptr(&irq_timings);
	struct irqt_stat *irqs;
	struct irqt_stat __percpu *s;
	u64 ts, ne = U64_MAX;
	int i, irq = 0;
	/*
	 * The interrupt the prediction comes from. It is kept apart
	 * from 'irq' which contains the last interrupt decoded from
	 * the buffer, otherwise a stale interrupt number would be
	 * traced when no prediction can be done.
	 */
	int next_irq = 0;

	/*
	 * Inject the timings of the last busy cycle in the statistics
	 * of the interrupts we are tracking.
	 */
	for_each_irqts(i, irqts) {

		irq_timing_decode(irqts->values[i], &ts, &irq);

		s = idr_find(&irqt_stats, irq);
		if (s) {
			irqs = this_cpu_ptr(s);
			irqs_update(irqs, ts);
		}
	}

	/*
	 * Look in the list of interrupts' statistics, the earliest
	 * next event.
	 */
	idr_for_each_entry(&irqt_stats, s, i) {

		irqs = this_cpu_ptr(s);

		if (!irqs->valid)
			continue;

		if (irqs->ne <= now) {
			next_irq = i;
			ne = now;

			/*
			 * This interrupt must not be used in the
			 * future until new events occur and update
			 * the statistics.
			 */
			irqs->valid = 0;
			break;
		}

		if (irqs->ne < ne) {
			next_irq = i;
			ne = irqs->ne;
		}
	}

	if (next_irq)
		trace_irq_timings_next_event(next_irq, ne);

	return ne;
}

void irq_timings_free(int irq)
{
	struct irqt_stat __percpu *s;

	s = idr_find(&irqt_stats, irq);
	if (s) {
		free_percpu(s);
		idr_remove(&irqt_stats, irq);
	}
}

int irq_timings_alloc(int irq)
{
	int id;
	struct irqt_stat __percpu *s;

	/*
	 * Some platforms can have the same private interrupt per cpu,
	 * so this function may be be called several times with the
	 * same interrupt number. Just bail out in case the per cpu
	 * stat structure is already allocated.
	 */
	s = idr_find(&irqt_stats, irq);
	if (s)
		return 0;

	s = alloc_percpu(*s);
	if (!s)
		return -ENOMEM;

	idr_preload(GFP_KERNEL);
	id = idr_alloc(&irqt_stats, s, irq, irq + 1, GFP_NOWAIT);
	idr_preload_end();

	if (id < 0) {
		free_percpu(s);
		return id;
	}

	return 0;
}

#ifdef CONFIG_TEST_IRQ_TIMINGS
/*
 * Push @count <timestamp, irq> couples in the timings circular buffer
 * and check for_each_irqts() gives them back, in the order they were
 * pushed, beginning with the oldest one still in the buffer.
 *
 * The timestamps and the interrupt numbers are consecutive values
 * beginning at 0xDEAD and 0xBEEF respectively.
 *
 * Returns zero on success, -EINVAL along with a warning when a check
 * fails.
 */
static int __init irq_timings_test_irqts(struct irq_timings *irqts,
					 unsigned count)
{
	const int base_ts = 0xDEAD, base_irq = 0xBEEF;
	/*
	 * Number of values overwritten because the index wrapped up,
	 * zero when everything fits in the buffer.
	 */
	int dropped = count > IRQ_TIMINGS_SIZE ? count - IRQ_TIMINGS_SIZE : 0;
	/*
	 * The values expected for the oldest element of the buffer.
	 */
	int want_ts = base_ts + dropped, want_irq = base_irq + dropped;
	int i, irq;
	u64 ts;

	/*
	 * The buffer is filled with the function dedicated to that.
	 */
	for (i = 0; i < count; i++) {

		pr_debug("%d (%d): %X %X\n", i, i & IRQ_TIMINGS_MASK,
			 base_ts + i, base_irq + i);

		irq_timings_push(base_ts + i, base_irq + i);
	}

	/*
	 * The count must be the number of pushed values.
	 */
	pr_debug(" -- Checking timings array count (%d) is right\n", count);
	if (WARN_ON(irqts->count != count))
		return -EINVAL;

	/*
	 * Browsing the buffer must return the values in the order
	 * they were pushed.
	 */
	pr_debug(" -- Checking the for_each_irqts() macro\n");
	for_each_irqts(i, irqts) {

		irq_timing_decode(irqts->values[i], &ts, &irq);

		pr_debug("%d: %llX (%X) %X (%X)\n", i, ts, want_ts, irq,
			 want_irq);

		if (WARN_ON(ts != want_ts || irq != want_irq))
			return -EINVAL;

		want_ts++;
		want_irq++;
	}

	/*
	 * Browsing the buffer with for_each_irqts() consumes the
	 * values, it must be empty now.
	 */
	pr_debug(" -- Checking timings array is empty after browsing it\n");
	if (WARN_ON(irqts->count))
		return -EINVAL;

	return 0;
}

/*
 * Self test of the timings circular buffer, run as an early initcall.
 *
 * Returns zero when all the tests passed or when the test is skipped
 * because the irq timings are already enabled, the error of the
 * first failing test otherwise.
 */
static int __init irq_timings_selftest(void)
{
	struct irq_timings *irqts = this_cpu_ptr(&irq_timings);
	/*
	 * The circular buffer is tested with different numbers of
	 * elements in order to check the limits: empty, half full,
	 * full, wrapped with the cursor at the boundaries, wrapped
	 * several times, etc ...
	 */
	int sizes[] = {
		0,
		IRQ_TIMINGS_SIZE >> 1,
		IRQ_TIMINGS_SIZE,
		IRQ_TIMINGS_SIZE + (IRQ_TIMINGS_SIZE >> 1),
		2 * IRQ_TIMINGS_SIZE,
		(2 * IRQ_TIMINGS_SIZE) + 3,
	};
	int n, err;

	pr_info("Starting irq timings selftest\n");

	/*
	 * At this point, we don't expect any subsystem but us to use
	 * the irq timings, so they should not be enabled.
	 */
	if (static_branch_unlikely(&irq_timing_enabled)) {
		pr_warn("irq timings already initialized, skipping selftest\n");
		return 0;
	}

	for (n = 0; n < ARRAY_SIZE(sizes); n++) {

		pr_info(" -- Checking the timings with %d/%d values\n",
			sizes[n], IRQ_TIMINGS_SIZE);

		err = irq_timings_test_irqts(irqts, sizes[n]);
		if (err)
			return err;
	}

	pr_info("Irq timings selftest passed\n");

	return 0;
}
early_initcall(irq_timings_selftest);
#endif