# Test/grader preamble: configure the exercise grader and the warning policy
# before the notebook body runs.
from manipulation.exercises.grader import set_grader_throws
# NOTE(review): presumably makes grader failures raise instead of only being
# recorded -- confirm against manipulation.exercises.grader.
set_grader_throws(True)

from pydrake.common.deprecation import DrakeDeprecationWarning
import warnings
# Escalate Drake deprecation warnings to exceptions so that use of a
# deprecated Drake API fails loudly instead of scrolling past.
warnings.simplefilter("error", DrakeDeprecationWarning)

# Best effort: if manipulation.utils is importable, call
# set_running_as_test(True) (presumably flags this run as a test -- confirm);
# if the module is missing, carry on without it.
try:
    from manipulation.utils import set_running_as_test
    set_running_as_test(True)

except ModuleNotFoundError:
    pass
#!/usr/bin/env python
# coding: utf-8

# # Contrastive Loss in Dense Object Nets

# In[ ]:


import numpy as np


# ## Intro
# In this notebook, you will be working on implementing the loss for [dense object net](https://arxiv.org/abs/1806.08756).
# 
# When doing robotic manipulation with perception, we sometimes need to find pixel correspondences between two images. For example, suppose we have an image of a mug for which we have pre-computed the optimal contact position on the handle, so we know the pixel that corresponds to the contact position in this source image. Now, given an image of a cluttered scene containing the mug, we want to identify the pixel that corresponds to the same contact position on the handle in order to transfer the grasp to the new scene.
# 
# <p style="text-align:center;"><img src="https://iili.io/tSulS4.png" width="500"></p>
# 
# In reality, we don't want to train one correspondence model for every single possible point of interest on the object. Instead, we hope to train one model that can help us establish correspondence for arbitrary points on the object, such as the handle, a joint, a point on the lid, etc.
# 
# To achieve this, we use a convolutional neural network to parameterize a dense descriptor $f$. Given an image $I$ of shape $H \times W \times 3$, $f$ takes in $I$ and outputs a $D$-dimensional feature vector at every pixel. That is, $f(I) \in \mathbb{R}^{H\times W\times D}$. If we specify an index $(i, j)$ in the 2D image frame, the feature vector $f(I)[i,j]$ 'describes' the semantic meaning of pixel $I[i,j]$. Intuitively, if the image content at two pixel locations is similar, we should get similar descriptor vectors. Notice that $f$ is a fully convolutional neural network, so $f(I)[i,j]$ depends not just on pixel $I[i,j]$ but also on its neighborhood.
# 
# We now offer a (loose) mathematical formulation of 'describes':
# Given two images $I_a, I_b$ and a pair of pixel coordinates $(i_a, j_a)$ and $(i_b, j_b)$, if the pixels $I_a[i_a, j_a]$ and $I_b[i_b, j_b]$ correspond to the same point on the object (e.g. the tip of a pencil), we hope $\| f(I_a)[i_a, j_a] - f(I_b)[i_b, j_b] \|_2$ is as small as possible. If they correspond to distinct points on the object (e.g. the pencil tip and the rubber), $\| f(I_a)[i_a, j_a] - f(I_b)[i_b, j_b] \|_2$ should be as big as possible.
# 
# If we project the D-dimensional features at each pixel location to the 3-dimensional RGB space, we can see that corresponding pairs share the same color.
# <p style="text-align:center;"><img src="https://iili.io/tSzhiJ.md.png" width="600"></p>

# ## a) Loss Implementation
# In the code block below, you are asked to implement the loss in Dense Object Net (DON). Read section 3.1 of the [paper](https://arxiv.org/pdf/1806.08756.pdf) and fill out the following function. In DON paper, $u$ instead of $(i, j)$ is used as the notation for index in camera plane. The variable u in the code block uses the definition from the DON paper.
# 
# By deep learning convention, you are not allowed to use any kind of loops in the function to make your code fast to compute.
# 
# 
# Note: In deep learning, the data always come in batches. So a batch of indices u will have shape (N, 2) for batch size N. Usually images come in batches too, but in DON we sample a large batch of indices for every image, therefore the input images are not batched here.
# 

# In[ ]:


def don_loss(f, img_a, img_b, u_a, u_b, match, margin=2.0):
    """
    Exercise stub: contrastive loss of Dense Object Nets (DON) for one batch.

    Args:
        f: dense descriptor network; maps a batch of 3-channel images of shape (N, H, W, 3) to per-pixel features of shape (N, H, W, D)
        img_a: np.ndarray of shape (H, W, 3), the first image
        img_b: np.ndarray of shape (H, W, 3), the second image
        u_a: np.ndarray of shape (N, 2), (row_idx, col_idx) pixel locations in img_a
        u_b: np.ndarray of shape (N, 2), (row_idx, col_idx) pixel locations in img_b
        match: np.ndarray of shape (N, 1), booleans telling whether pair k is a match
        margin: the margin M from section 3.1 of the DON paper
    Return:
        loss_matches: a float, L_matches from section 3.1 of the DON paper
        loss_nonmatches: a float, L_non-matches from section 3.1 of the DON paper
    """
    ### Your code here ###
    # Loops are not allowed here! Search for the numpy functions you need instead.
    loss_matches, loss_nonmatches = 0.0, 0.0  # modify me (both values)

    return loss_matches, loss_nonmatches


# In[ ]:


def don_loss(f, img_a, img_b, u_a, u_b, match, margin=2.0):
    """
    Compute the Dense Object Nets (DON) contrastive loss for a batch of pixel pairs.

    Pair k consists of pixel u_a[k] in img_a and pixel u_b[k] in img_b. With
    D_k the L2 distance between their descriptors, the two returned terms are
    mean(D_k^2) over matching pairs and mean(max(0, margin - D_k)^2) over
    non-matching pairs (DON paper, section 3.1). No Python loops are used.

    Args:
        f: a neural network that takes in a batch of images with 3 channels and outputs dense features with D channels for each pixel location. e.g. f(I) has shape (N, H, W, D) for I of shape (N, H, W, 3)
        img_a: np.ndarray with shape (H, W, 3), an image
        img_b: np.ndarray with shape (H, W, 3), an image; its height/width may differ from img_a
        u_a: np.ndarray with shape (N, 2), a batch of integer indices (row_idx, col_idx) to index location in img_a
        u_b: np.ndarray with shape (N, 2), a batch of integer indices (row_idx, col_idx) to index location in img_b
        match: np.ndarray with shape (N, 1) (or (N,)), truthy where the pair is a match
        margin: the margin parameter M in DON paper section 3.1
    Return:
        loss_matches: a float whose value is the L_matches in DON paper section 3.1 (0.0 if the batch has no matches)
        loss_nonmatches: a float whose value is the L_non-matches in DON paper section 3.1 (0.0 if the batch has no non-matches)
    Raises:
        IndexError: if a row or column index lies outside its descriptor map.
    """
    # f works on batches: add a leading batch axis for the call, drop it after.
    phi_a = f(img_a[None])[0]
    phi_b = f(img_b[None])[0]

    u_a = np.asarray(u_a)
    u_b = np.asarray(u_b)
    # Coerce to a flat boolean mask so that `~` is a logical NOT even when the
    # labels arrive as 0/1 integers.
    is_match = np.asarray(match, dtype=bool).reshape(-1)

    # Index (row, col) directly on each map. A flattened `row * w + col` offset
    # would need each image's own width and would let an out-of-range column
    # silently alias to a neighbouring row instead of raising.
    desc_a = phi_a[u_a[:, 0], u_a[:, 1]]
    desc_b = phi_b[u_b[:, 0], u_b[:, 1]]

    dist = np.linalg.norm(desc_a - desc_b, axis=1)
    match_terms = dist[is_match] ** 2
    nonmatch_terms = np.clip(margin - dist[~is_match], a_min=0, a_max=None) ** 2

    # The mean of an empty set is NaN (plus a RuntimeWarning); an absent
    # category contributes no loss instead.
    loss_matches = np.mean(match_terms) if match_terms.size else 0.0
    loss_nonmatches = np.mean(nonmatch_terms) if nonmatch_terms.size else 0.0

    return loss_matches, loss_nonmatches


# Since we are not training DON, we used numpy instead of commonly used deep learning frameworks like pytorch or jax. These frameworks have built-in auto-differentiation and share a very similar syntax with numpy. With the loss implemented, all we need to do is keep sampling data and call auto-differentiation to perform gradient descent to train the network.

# ## b) Prediction
# After DON is trained, given image $I_a$ and a pixel location $u_a$, we can use $f$ to find a corresponding location $u_b$ in $I_b$ such that the features at these two locations are the closest. Implement the following function to do inference for DON.

# In[ ]:


def don_predict(f, img_a, img_b, u_a):
    """
    Exercise stub: use a trained DON to locate, in image b, the point matching u_a.

    Args:
        f: dense descriptor network; maps a batch of 3-channel images of shape (N, H, W, 3) to per-pixel features of shape (N, H, W, D)
        img_a: np.ndarray of shape (H, W, 3), the source image
        img_b: np.ndarray of shape (H, W, 3), the image to search
        u_a: np.ndarray of shape (2, ), (row, col) of the point of interest in img_a
    Return:
        u_b: np.ndarray of shape (2, ), (row, col) of the corresponding point in img_b
    """
    ### Your code here
    u_b = np.zeros((2,))  # modify me
    return u_b


# In[ ]:


def don_predict(f, img_a, img_b, u_a):
    """
    Run trained DON to find the corresponding point coordinate in image b.

    The descriptor at u_a in img_a is compared (L2 distance) against the
    descriptor of every pixel in img_b; the closest pixel is returned. Ties are
    resolved in favour of the first pixel in row-major order.

    Args:
        f: a neural network that takes in a batch of images with 3 channels and outputs dense features with D channels for each pixel location. e.g. f(I) has shape (N, H, W, D) for I of shape (N, H, W, 3)
        img_a: np.ndarray with shape (H, W, 3), an image
        img_b: np.ndarray with shape (H, W, 3), an image; its height/width may differ from img_a
        u_a: np.ndarray with shape (2, ), row and col indices in img_a that specifies the point on object
    Return:
        u_b: np.ndarray with shape (2, ), row and col indices in img_b that correspond to the point on object
    """
    # f works on batches: add a leading batch axis for the call, drop it after.
    phi_a = f(img_a[None])[0]
    phi_b = f(img_b[None])[0]

    query = phi_a[u_a[0], u_a[1]]
    # Broadcast the (D,) query against the (H_b, W_b, D) map to get a per-pixel
    # distance image.
    dist = np.linalg.norm(phi_b - query, axis=-1)

    # argmin yields a flat index into img_b's grid, so it must be unravelled
    # with img_b's own spatial shape, not img_a's width.
    u_b = np.array(np.unravel_index(np.argmin(dist), dist.shape))
    return u_b


# ## How will this notebook be Graded?
# 
# If you are enrolled in the class, this notebook will be graded using [Gradescope](https://www.gradescope.com). You should have gotten the enrollment code on our announcement in Piazza.
# 
# For submission of this assignment, you must do the following:
# - Download and submit the notebook `contrastive.ipynb` to Gradescope's notebook submission section, along with your notebook for the other problems.
# 
# We will evaluate the local functions in the notebook to see if the function behaves as we have expected. For this exercise, the rubric is as follows:
# - [4 pts] `don_loss` must be implemented correctly.
# - [4 pts] `don_predict` must be implemented correctly.

# In[ ]:


# Autograder entry point: hand the TestContrastive suite the names defined in
# this notebook (via locals(), so it sees the latest don_loss / don_predict),
# store the outcome in 'results.json', then print that file's results.
# NOTE(review): the exact Grader semantics live in manipulation.exercises.grader -- confirm there.
from manipulation.exercises.deep_perception.test_contrastive import TestContrastive
from manipulation.exercises.grader import Grader

Grader.grade_output([TestContrastive], [locals()], 'results.json')
Grader.print_test_results('results.json')

