#!/usr/bin/env python
# coding: utf-8

# - Currently we are using a pretty naive minimization algorithm: it approximates the gradient numerically and moves in that direction, which requires lots of function calls.
# - We have no way to get the Hessian, which is needed for the variance estimates of our parameters.

# ### Automatic differentiation - how to differentiate an algorithm
#
# Symbolic differentiation is what we learned in school, and software like SymPy, Wolfram and Mathematica do this well. But I want to differentiate the following:

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
from matplotlib import pyplot as plt


def f(x):
    y = 0.
    for i in range(100):
        y = np.sin(y + x)
    return y


x = np.linspace(-2, 2, 100)
plt.plot(x, [f(x_) for x_ in x])


# Mathematically, it's something like:
#
# $$f(x) = \sin(x + \sin(x + \sin(x + \dots\sin(x)\dots)))$$
#
# Good luck differentiating that and getting a nice closed form. If this is not complicated enough for you, feel free to add some `if` statements.
#
# We can use `autograd`, an automatic differentiation package, to compute _exact, pointwise_ derivatives.

# In[2]:


from autograd import grad
from autograd import numpy as np  # autograd's wrapped numpy


def f(x):
    y = 0.
    for i in range(100):
        y = np.sin(y + x)
    return y


grad(f)(1.)  # magic!


# Of course, you can string together these pointwise derivatives into a function:

# In[3]:


grad_f = grad(f)

x = np.linspace(-2, 2, 100)
plt.plot(x, [grad_f(x_) for x_ in x])


# To use this in our optimization routines:
#
# - we can automatically compute gradients that the optimizer can use.
# - Hessians can be exactly calculated (we will do this later).

# In[4]:


# simulated durations and event (non-censoring) indicators
T = (np.random.exponential(size=1000) / 1.5) ** 2.3
E = np.random.binomial(1, 0.95, size=1000)


# In[5]:


def cumulative_hazard(params, t):
    lambda_, rho_ = params
    return (t / lambda_) ** rho_

def log_hazard(params, t):
    lambda_, rho_ = params
    return np.log(rho_) - np.log(lambda_) + (rho_ - 1) * (np.log(t) - np.log(lambda_))

def log_likelihood(params, t, e):
    return np.sum(e * log_hazard(params, t)) - np.sum(cumulative_hazard(params, t))

def negative_log_likelihood(params, t, e):
    return -log_likelihood(params, t, e)


# So `grad(negative_log_likelihood)` will find the gradient of `negative_log_likelihood` with respect to its first argument, `params`.

# In[6]:


grad_negative_log_likelihood = grad(negative_log_likelihood)

print(grad_negative_log_likelihood(np.array([1., 1.]), T, E))


# In[7]:


from scipy.optimize import minimize

results = minimize(
    negative_log_likelihood,
    x0=np.array([1.0, 1.0]),
    method=None,  # with bounds and no constraints, scipy selects L-BFGS-B
    args=(T, E),
    jac=grad_negative_log_likelihood,
    bounds=((0.00001, None), (0.00001, None)))

print(results)


# In[8]:


from autograd import value_and_grad

results = minimize(
    value_and_grad(negative_log_likelihood),  # returns both the value and the gradient
    x0=np.array([1.0, 1.0]),
    method=None,
    args=(T, E),
    jac=True,  # notice this is set to True now
    bounds=((0.00001, None), (0.00001, None)))


# In[9]:


results


# Let's continue this analytical-train 🚂 to Part 4!
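
# As a preview of the Hessian calculation promised above, here is a minimal sketch: `autograd` also ships a `hessian` function, and inverting the Hessian of the negative log-likelihood at the MLE gives the usual approximation of the variance-covariance matrix of the estimates. The names `H` and `variance_matrix` below are illustrative only.

# In[ ]:


from autograd import hessian

hessian_nll = hessian(negative_log_likelihood)   # exact Hessian with respect to `params`
H = hessian_nll(results.x, T, E)                 # evaluate at the fitted parameters
variance_matrix = np.linalg.inv(H)               # approximate variance-covariance matrix
print(np.sqrt(np.diag(variance_matrix)))         # standard errors of lambda_ and rho_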