
[CS231N] Assignment 3 Q1. Image Captioning with Vanilla RNNs


Assignment 3: https://cs231n.github.io/assignments2024/assignment3/#q1-image-captioning-with-vanilla-rnns

 


 

https://wikidocs.net/22886

  • An ordinary neural network (MLP, CNN) = a student who solves exam questions one at a time and remembers nothing about the earlier questions
  • An RNN = a student who can refer back to the questions already solved while working on the current one
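
Concretely, a vanilla RNN keeps that memory in a hidden state that is updated at every timestep as next_h = tanh(x·Wx + prev_h·Wh + b); this single step is exactly what rnn_step_forward below computes.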

 

import numpy as np  # the snippets below also assume the sigmoid helper from rnn_layers.py

def rnn_step_forward(x, prev_h, Wx, Wh, b):
    next_h, cache = None, None
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    # Affine transform of the input and the previous hidden state, then tanh.
    q = np.dot(prev_h, Wh) + np.dot(x, Wx) + b
    next_h = np.tanh(q)
    cache = x, prev_h, next_h, Wx, Wh, b, q

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return next_h, cache
def rnn_step_backward(dnext_h, cache):
    dx, dprev_h, dWx, dWh, db = None, None, None, None, None
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    x, prev_h, next_h, Wx, Wh, b, q = cache
    dq = dnext_h * (1 - next_h ** 2)  # backprop through tanh: tanh'(q) = 1 - tanh(q)^2
    dx = np.dot(dq, Wx.T)
    dprev_h = np.dot(dq, Wh.T)
    dWx = np.dot(x.T, dq)
    dWh = np.dot(prev_h.T, dq)
    db = np.sum(dq, axis=0)  # bias gradient sums over the batch

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return dx, dprev_h, dWx, dWh, db
def rnn_forward(x, h0, Wx, Wh, b):
    h, cache = None, None
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    N, T, D = x.shape
    H = h0.shape[1]
    cache = []
    h = []
    prev_h = h0
    # Run one forward step per timestep, threading the hidden state through.
    for t in range(T):
        prev_h, cache_t = rnn_step_forward(x[:, t, :], prev_h, Wx, Wh, b)
        h.append(prev_h)
        cache.append(cache_t)
    h = np.array(h).transpose(1, 0, 2)  # (T, N, H) -> (N, T, H)

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return h, cache
def rnn_backward(dh, cache):
    dx, dh0, dWx, dWh, db = None, None, None, None, None
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    N, T, H = dh.shape
    D = cache[0][0].shape[1]  # input dimension, recovered from the first cached x

    dx = np.zeros((N, T, D))
    prev_dh = np.zeros((N, H))  # gradient flowing into h_t from future timesteps
    dWx = np.zeros((D, H))
    dWh = np.zeros((H, H))
    db = np.zeros(H)

    # Walk backwards through time; the weight gradients accumulate across steps.
    for t in reversed(range(T)):
        dx[:, t, :], prev_dh, dWx_t, dWh_t, db_t = rnn_step_backward(dh[:, t, :] + prev_dh, cache[t])
        dWx += dWx_t
        dWh += dWh_t
        db += db_t
    dh0 = prev_dh

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return dx, dh0, dWx, dWh, db
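
A quick numeric gradient check, in the spirit of the notebook's own checks, helps confirm the backward pass. This is a self-contained sketch with toy sizes; num_grad is a hypothetical helper written inline (central differences), not part of the assignment code.

import numpy as np

def num_grad(f, x, df, h=1e-5):
    # Central-difference gradient of sum(f(x) * df) with respect to x.
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=["multi_index"])
    while not it.finished:
        ix = it.multi_index
        old = x[ix]
        x[ix] = old + h; pos = f(x).copy()
        x[ix] = old - h; neg = f(x).copy()
        x[ix] = old
        grad[ix] = np.sum((pos - neg) * df) / (2 * h)
        it.iternext()
    return grad

N, T, D, H = 2, 3, 4, 5
x = np.random.randn(N, T, D)
h0 = np.random.randn(N, H)
Wx, Wh, b = np.random.randn(D, H), np.random.randn(H, H), np.random.randn(H)

out, cache = rnn_forward(x, h0, Wx, Wh, b)
dout = np.random.randn(*out.shape)
dx, dh0, dWx, dWh, db = rnn_backward(dout, cache)

dx_num = num_grad(lambda v: rnn_forward(v, h0, Wx, Wh, b)[0], x, dout)
print(np.max(np.abs(dx - dx_num)))  # should be ~1e-8 or smaller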

 

Word Embedding Layer

def word_embedding_forward(x, W):
    out, cache = None, None
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    out = W[x, :]  # integer-array indexing: (N, T) indices -> (N, T, D) vectors
    cache = x, W

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return out, cache
def word_embedding_backward(dout, cache):
    dW = None
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    x, W = cache
    dW = np.zeros_like(W)
    np.add.at(dW, x, dout)  # scatter-add, so repeated words accumulate gradient

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return dW
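
A tiny example makes the forward lookup and the scatter-add backward concrete (toy numbers; assumes the two functions above are in scope):

import numpy as np

V, D = 5, 3                      # vocab size, embedding dimension
W = np.random.randn(V, D)
x = np.array([[0, 3, 0]])        # word 0 appears twice

out, cache = word_embedding_forward(x, W)
print(out.shape)                 # (1, 3, 3): one D-vector per token

dW = word_embedding_backward(np.ones_like(out), cache)
print(dW[0])                     # row 0 accumulated twice: all 2.0
print(dW[3])                     # row 3 accumulated once: all 1.0
print(dW[1])                     # unused word: all 0.0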

 

LSTM (Long Short-Term Memory) Layer

A gating mechanism designed to mitigate vanishing gradients, one of the main weaknesses of vanilla RNNs.

https://wikidocs.net/152773
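
For reference, one LSTM step computes four gates from a single affine transform, a = x·Wx + prev_h·Wh + b, with Wx of shape (D, 4H) and Wh of shape (H, 4H). Splitting a into four H-wide blocks gives i = sigmoid(ai), f = sigmoid(af), o = sigmoid(ao), g = tanh(ag), and then next_c = f * prev_c + i * g and next_h = o * tanh(next_c). Because the cell-state update is mostly elementwise, gradients flowing along next_c survive far longer than through a vanilla RNN's repeated matrix multiplications.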

 

def lstm_step_forward(x, prev_h, prev_c, Wx, Wh, b):
    next_h, next_c, cache = None, None, None
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    H = Wh.shape[0]

    a = np.dot(prev_h, Wh) + np.dot(x, Wx) + b # linear transform feeding all four gates
    ai = a[:, 0*H:1*H] # input gate
    af = a[:, 1*H:2*H] # forget gate
    ao = a[:, 2*H:3*H] # output gate
    ag = a[:, 3*H:4*H] # candidate cell state

    i = sigmoid(ai) # how much new information to take in
    f = sigmoid(af) # how much past information to keep
    o = sigmoid(ao) # how much of the cell state to expose in the hidden state
    g = np.tanh(ag) # candidate new memory

    next_c = f * prev_c + i * g
    next_h = o * np.tanh(next_c)

    cache = x, prev_h, prev_c, Wx, Wh, b, i, f, o, g, ai, af, ao, ag, next_c

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return next_h, next_c, cache
def lstm_step_backward(dnext_h, dnext_c, cache):
    dx, dprev_h, dprev_c, dWx, dWh, db = None, None, None, None, None, None
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    x, prev_h, prev_c, Wx, Wh, b, i, f, o, g, ai, af, ao, ag, next_c = cache

    N, H = np.shape(dnext_h)

    # Backprop through next_h = o * tanh(next_c).
    do = np.tanh(next_c) * dnext_h
    dtanh_next_c = o * dnext_h
    # next_c receives gradient both directly and through next_h.
    dnext_c_total = dnext_c + (1 - np.tanh(next_c)**2) * dtanh_next_c

    # Backprop through next_c = f * prev_c + i * g.
    df = prev_c * dnext_c_total
    dprev_c = f * dnext_c_total
    di = g * dnext_c_total
    dg = i * dnext_c_total

    # Gate local gradients. The gate activations are cached, so reuse them
    # instead of recomputing: sigmoid'(a) = s * (1 - s), tanh'(a) = 1 - tanh^2(a).
    dai = i * (1 - i) * di
    daf = f * (1 - f) * df
    dao = o * (1 - o) * do
    dag = (1 - g**2) * dg

    da = np.zeros((N, 4*H))
    da[:, 0*H:1*H] = dai
    da[:, 1*H:2*H] = daf
    da[:, 2*H:3*H] = dao
    da[:, 3*H:4*H] = dag

    dx = np.dot(da, Wx.T)
    dprev_h = np.dot(da, Wh.T)
    dWx = np.dot(x.T, da)
    dWh = np.dot(prev_h.T, da)
    db = np.sum(da, axis=0)

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return dx, dprev_h, dprev_c, dWx, dWh, db
def lstm_forward(x, h0, Wx, Wh, b):
    h, cache = None, None
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    N, T, D = x.shape
    H = h0.shape[1]
    h = np.zeros((N, T, H))
    cache = []
    prev_h = h0
    prev_c = np.zeros_like(prev_h)  # the initial cell state is all zeros
    for t in range(T):
        next_h, next_c, cache_t = lstm_step_forward(x[:,t], prev_h, prev_c, Wx, Wh, b)
        h[:, t] = next_h
        cache.append(cache_t)
        prev_h = next_h
        prev_c = next_c

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return h, cache
def lstm_backward(dh, cache):
    dx, dh0, dWx, dWh, db = None, None, None, None, None
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    N, T, H = dh.shape
    D = cache[0][0].shape[1]

    dx = np.zeros((N, T, D))
    prev_dh = np.zeros((N, H))  # gradient from future timesteps into h_t
    prev_dc = np.zeros((N, H))  # gradient from future timesteps into c_t
    dWx = np.zeros((D, 4*H))
    dWh = np.zeros((H, 4*H))
    db = np.zeros(4*H)

    for t in reversed(range(T)):
        dx[:,t,:], prev_dh, prev_dc, dWx_t, dWh_t, db_t = lstm_step_backward(dh[:,t,:] + prev_dh, prev_dc, cache[t])
        dWx += dWx_t
        dWh += dWh_t
        db += db_t
    dh0 = prev_dh  # dc0 is not returned: the initial cell state is a constant zero

    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return dx, dh0, dWx, dWh, db
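
A quick shape check for the stacked LSTM (toy sizes; note that Wx and Wh carry 4*H columns, one block per gate, and sigmoid is the helper from the assignment's rnn_layers.py):

import numpy as np

N, T, D, H = 2, 4, 5, 3
x = np.random.randn(N, T, D)
h0 = np.random.randn(N, H)
Wx = np.random.randn(D, 4 * H)
Wh = np.random.randn(H, 4 * H)
b = np.random.randn(4 * H)

h, cache = lstm_forward(x, h0, Wx, Wh, b)
print(h.shape)                         # (2, 4, 3), i.e. (N, T, H)

dx, dh0, dWx, dWh, db = lstm_backward(np.random.randn(*h.shape), cache)
print(dx.shape, dWx.shape, db.shape)   # (2, 4, 5) (5, 12) (12,)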

 

Temporal Affine Layer

Applies the same affine transform to the hidden state at every timestep, turning each hidden vector into scores over the vocabulary for the final captioning step.

def temporal_affine_forward(x, w, b):
    N, T, D = x.shape
    M = b.shape[0]
    out = x.reshape(N * T, D).dot(w).reshape(N, T, M) + b
    cache = x, w, b, out
    return out, cache
def temporal_affine_backward(dout, cache):
    x, w, b, out = cache
    N, T, D = x.shape
    M = b.shape[0]

    dx = dout.reshape(N * T, M).dot(w.T).reshape(N, T, D)
    dw = dout.reshape(N * T, M).T.dot(x.reshape(N * T, D)).T
    db = dout.sum(axis=(0, 1))

    return dx, dw, db
def temporal_softmax_loss(x, y, mask, verbose=False):
    N, T, V = x.shape

    x_flat = x.reshape(N * T, V)
    y_flat = y.reshape(N * T)
    mask_flat = mask.reshape(N * T)

    probs = np.exp(x_flat - np.max(x_flat, axis=1, keepdims=True))  # stable softmax
    probs /= np.sum(probs, axis=1, keepdims=True)
    # The mask silences <NULL> padding so it contributes neither loss nor gradient.
    loss = -np.sum(mask_flat * np.log(probs[np.arange(N * T), y_flat])) / N
    dx_flat = probs.copy()
    dx_flat[np.arange(N * T), y_flat] -= 1
    dx_flat /= N
    dx_flat *= mask_flat[:, None]

    if verbose:
        print("dx_flat: ", dx_flat.shape)

    dx = dx_flat.reshape(N, T, V)

    return loss, dx
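
A small check shows how the mask silences padded positions (toy numbers; assumes temporal_softmax_loss as defined above):

import numpy as np

N, T, V = 2, 3, 4
x = np.random.randn(N, T, V)
y = np.random.randint(V, size=(N, T))
mask = np.ones((N, T), dtype=bool)
mask[:, -1] = False  # pretend the last timestep is <NULL> padding

loss, dx = temporal_softmax_loss(x, y, mask)
print(loss, dx.shape)           # scalar, (2, 3, 4)
print(np.abs(dx[:, -1]).max())  # 0.0: masked positions receive no gradient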

 

The Final CaptioningRNN

    def loss(self, features, captions):
        # Cut captions into two pieces: captions_in has everything but the last
        # word and is fed to the RNN; captions_out has everything but the first
        # word and is what we expect the RNN to generate. The first element of
        # captions_in will be the <START> token, and the first element of
        # captions_out will be the first word.
        captions_in = captions[:, :-1]
        captions_out = captions[:, 1:]

        # You'll need this
        mask = captions_out != self._null

        # Weight and bias for the affine transform from image features to initial
        # hidden state
        W_proj, b_proj = self.params["W_proj"], self.params["b_proj"]

        # Word embedding matrix
        W_embed = self.params["W_embed"]

        # Input-to-hidden, hidden-to-hidden, and biases for the RNN
        Wx, Wh, b = self.params["Wx"], self.params["Wh"], self.params["b"]

        # Weight and bias for the hidden-to-vocab transformation.
        W_vocab, b_vocab = self.params["W_vocab"], self.params["b_vocab"]

        loss, grads = 0.0, {}
        
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

        # Forward pass: features -> initial hidden state -> RNN/LSTM over the
        # caption words -> per-timestep vocabulary scores -> softmax loss.
        out_proj, cache_proj = affine_forward(features, W_proj, b_proj)
        out_word_vector, cache_word_vector = word_embedding_forward(captions_in, W_embed)

        if self.cell_type == 'lstm':
            out_hidden, cache_hidden = lstm_forward(out_word_vector, out_proj, Wx, Wh, b)
        else:
            out_hidden, cache_hidden = rnn_forward(out_word_vector, out_proj, Wx, Wh, b)

        out_temporal_affine, cache_temporal_affine = temporal_affine_forward(out_hidden, W_vocab, b_vocab)
        loss, dscores = temporal_softmax_loss(out_temporal_affine, captions_out, mask)

        # Backward pass in reverse order, filling the grads dict as we go.
        dh, grads['W_vocab'], grads['b_vocab'] = temporal_affine_backward(dscores, cache_temporal_affine)

        if self.cell_type == 'lstm':
            dx, dh0, grads['Wx'], grads['Wh'], grads['b'] = lstm_backward(dh, cache_hidden)
        else:
            dx, dh0, grads['Wx'], grads['Wh'], grads['b'] = rnn_backward(dh, cache_hidden)

        grads['W_embed'] = word_embedding_backward(dx, cache_word_vector)
        _, grads['W_proj'], grads['b_proj'] = affine_backward(dh0, cache_proj)

        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        return loss, grads
    def sample(self, features, max_length=30):
        N = features.shape[0]
        captions = self._null * np.ones((N, max_length), dtype=np.int32)

        # Unpack parameters
        W_proj, b_proj = self.params["W_proj"], self.params["b_proj"]
        W_embed = self.params["W_embed"]
        Wx, Wh, b = self.params["Wx"], self.params["Wh"], self.params["b"]
        W_vocab, b_vocab = self.params["W_vocab"], self.params["b_vocab"]
        
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

        # Project image features to the initial hidden state; the LSTM cell
        # state starts at zero.
        prev_h, _ = affine_forward(features, W_proj, b_proj)
        prev_c = np.zeros_like(prev_h)

        captions[:, 0] = self._start
        curr_word = np.ones((N, 1), dtype=np.int32) * self._start

        # Greedy decoding: embed the previous word, run one RNN/LSTM step,
        # score the vocabulary, and take the argmax word at each timestep.
        for t in range(max_length - 1):
            word_embed, _ = word_embedding_forward(curr_word, W_embed)
            if self.cell_type == 'lstm':
                h, c, _ = lstm_step_forward(np.squeeze(word_embed), prev_h, prev_c, Wx, Wh, b)
            else:
                h, _ = rnn_step_forward(np.squeeze(word_embed), prev_h, Wx, Wh, b)
            scores, _ = temporal_affine_forward(h[:, np.newaxis, :], W_vocab, b_vocab)
            idx_best = np.squeeze(np.argmax(scores, axis=2))
            captions[:, t+1] = idx_best
            curr_word = captions[:, t+1]
            prev_h = h
            if self.cell_type == 'lstm':
                prev_c = c

        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        return captions
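
Finally, a minimal smoke test for the assembled model, assuming the CaptioningRNN class from the assignment's cs231n/classifiers/rnn.py (the vocabulary below is a made-up toy):

import numpy as np
from cs231n.classifiers.rnn import CaptioningRNN

word_to_idx = {"<NULL>": 0, "<START>": 1, "<END>": 2, "cat": 3, "dog": 4}
N, D, T = 2, 20, 5

model = CaptioningRNN(word_to_idx, input_dim=D, wordvec_dim=30,
                      hidden_dim=40, cell_type="rnn")
features = np.random.randn(N, D)
captions = np.random.randint(len(word_to_idx), size=(N, T))

loss, grads = model.loss(features, captions)     # scalar loss + dict of grads
sampled = model.sample(features, max_length=10)  # (N, 10) greedy token indices
print(loss, sampled.shape)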

 

 

References: https://github.com/chongyangma/cs231n/blob/master/assignments/assignment3/cs231n/classifiers/rnn.py

https://github.com/lionkingchuchu/cs231n/blob/main/assignment3/cs231n/transformer_layers.py

 

 

 
