Assignment 3: https://cs231n.github.io/assignments2024/assignment3/#q1-image-captioning-with-vanilla-rnns

- A plain feedforward network (MLP, CNN) = a student who answers each exam question in isolation, remembering nothing about the previous ones
- An RNN = a student who can look back at the questions already solved while working on the current one (see the sketch below)
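The hidden state is the whole difference. A minimal sketch of the contrast (the helper names here are mine, not assignment code):

import numpy as np

def mlp_step(x, W, b):
    # feedforward layer: sees only the current input, keeps no memory
    return np.tanh(x @ W + b)

def rnn_step(x, h_prev, Wx, Wh, b):
    # recurrent step: also receives the hidden state from the previous input
    return np.tanh(x @ Wx + h_prev @ Wh + b)

The assignment implements exactly this recurrent step, plus its backward pass: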

def rnn_step_forward(x, prev_h, Wx, Wh, b):
    next_h, cache = None, None
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    q = np.dot(prev_h, Wh) + np.dot(x, Wx) + b
    next_h = np.tanh(q)
    cache = x, prev_h, next_h, Wx, Wh, b, q
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return next_h, cache
def rnn_step_backward(dnext_h, cache):
    dx, dprev_h, dWx, dWh, db = None, None, None, None, None
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    x, prev_h, next_h, Wx, Wh, b, q = cache
    dq = dnext_h * (1 - next_h ** 2)
    dx = np.dot(dq, Wx.T)
    dprev_h = np.dot(dq, Wh.T)
    dWx = np.dot(x.T, dq)
    dWh = np.dot(prev_h.T, dq)
    db = np.sum(dq, axis=0)
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return dx, dprev_h, dWx, dWh, db
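A quick way to trust the backward pass: perturb one input entry and compare the numeric directional derivative against dx (the assignment notebook does the same job with eval_numerical_gradient_array from the starter code). A sketch with assumed toy sizes:

np.random.seed(0)
N, D, H = 3, 4, 5
x = np.random.randn(N, D)
prev_h = np.random.randn(N, H)
Wx, Wh, b = np.random.randn(D, H), np.random.randn(H, H), np.random.randn(H)
next_h, cache = rnn_step_forward(x, prev_h, Wx, Wh, b)
dnext_h = np.random.randn(N, H)
dx, dprev_h, dWx, dWh, db = rnn_step_backward(dnext_h, cache)
eps = 1e-6
x_pert = x.copy()
x_pert[0, 0] += eps  # nudge a single input entry
num = (rnn_step_forward(x_pert, prev_h, Wx, Wh, b)[0] - next_h) / eps
print(dx[0, 0], np.sum(num * dnext_h))  # the two should agree to several decimals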
def rnn_forward(x, h0, Wx, Wh, b):
    h, cache = None, None
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    N, T, D = x.shape
    H = h0.shape[1]
    cache = []
    h = []
    prev_h = h0
    for t in range(T):
        prev_h, cache_t = rnn_step_forward(x[:, t, :], prev_h, Wx, Wh, b)
        h.append(prev_h)
        cache.append(cache_t)
    h = np.array(h)           # (T, N, H)
    h = h.transpose(1, 0, 2)  # -> (N, T, H)
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return h, cache
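A quick shape check on toy sizes (values assumed): rnn_forward should return one H-dimensional hidden state per timestep.

N, T, D, H = 2, 3, 4, 5
x = np.random.randn(N, T, D)
h0 = np.random.randn(N, H)
Wx, Wh, b = np.random.randn(D, H), np.random.randn(H, H), np.zeros(H)
h, _ = rnn_forward(x, h0, Wx, Wh, b)
print(h.shape)  # (2, 3, 5), i.e. (N, T, H)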
def rnn_backward(dh, cache):
    dx, dh0, dWx, dWh, db = None, None, None, None, None
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    N, T, H = dh.shape
    _x = cache[0][0]  # recover D from the first cached input
    D = _x.shape[1]
    dx = np.zeros((N, T, D))
    prev_dh = np.zeros((N, H))
    dWx = np.zeros((D, H))
    dWh = np.zeros((H, H))
    db = np.zeros(H)
    for t in reversed(range(T)):
        dx[:, t, :], prev_dh, dWx_t, dWh_t, db_t = rnn_step_backward(dh[:, t, :] + prev_dh, cache[t])
        dWx += dWx_t
        dWh += dWh_t
        db += db_t
    dh0 = prev_dh
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return dx, dh0, dWx, dWh, db
Word Embedding Layer
def word_embedding_forward(x, W):
    out, cache = None, None
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    out = W[x, :]  # fancy indexing: one embedding row per word index
    cache = x, W
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return out, cache
def word_embedding_backward(dout, cache):
    dW = None
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    x, W = cache
    dW = np.zeros_like(W)
    np.add.at(dW, x, dout)  # scatter-add: repeated indices must accumulate
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return dW
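np.add.at matters here: with plain fancy-indexed assignment, repeated word indices in a caption would overwrite each other instead of accumulating. A toy illustration (sizes assumed):

V, D = 5, 3
x = np.array([1, 1, 2])        # word 1 appears twice
dout = np.ones((3, D))
dW = np.zeros((V, D))
np.add.at(dW, x, dout)
print(dW[1])                   # [2. 2. 2.]: both occurrences accumulated
dW_wrong = np.zeros((V, D))
dW_wrong[x] += dout            # buffered fancy indexing drops the duplicate
print(dW_wrong[1])             # [1. 1. 1.]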
LSTM (Long Short-Term Memory) Layer
A way to mitigate the vanishing gradient problem, one of the main weaknesses of vanilla RNNs: the cell state gives gradients an almost linear path backward through time, gated only by the forget gate, instead of passing through a tanh and a matrix multiply at every step.
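To see the problem the LSTM addresses, push a gradient back through many vanilla-RNN steps using the functions above. At this modest weight scale the norm shrinks geometrically (with a larger Wh it can instead explode); toy sizes assumed:

np.random.seed(0)
N, D, H, T = 1, 10, 10, 50
Wx = np.random.randn(D, H) / np.sqrt(D)
Wh = 0.5 * np.random.randn(H, H) / np.sqrt(H)
b = np.zeros(H)
h, caches = np.zeros((N, H)), []
for t in range(T):
    h, cache = rnn_step_forward(np.random.randn(N, D), h, Wx, Wh, b)
    caches.append(cache)
dh = np.random.randn(N, H)
for t in reversed(range(T)):
    _, dh, _, _, _ = rnn_step_backward(dh, caches[t])
print(np.linalg.norm(dh))  # tiny: the signal from step 0 has all but vanished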


def lstm_step_forward(x, prev_h, prev_c, Wx, Wh, b):
    next_h, next_c, cache = None, None, None
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    # sigmoid() is provided by the assignment's layers file
    H = Wh.shape[0]
    a = np.dot(prev_h, Wh) + np.dot(x, Wx) + b  # one linear map for all four gates
    ai = a[:, 0*H:1*H]  # input gate
    af = a[:, 1*H:2*H]  # forget gate
    ao = a[:, 2*H:3*H]  # output gate
    ag = a[:, 3*H:4*H]  # candidate cell state
    i = sigmoid(ai)     # how much new information to let in
    f = sigmoid(af)     # how much of the old cell state to keep
    o = sigmoid(ao)     # how much of the cell state reaches the hidden state
    g = np.tanh(ag)     # new candidate memory
    next_c = f * prev_c + i * g
    next_h = o * np.tanh(next_c)
    cache = x, prev_h, prev_c, Wx, Wh, b, i, f, o, g, ai, af, ao, ag, next_c
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return next_h, next_c, cache
def lstm_step_backward(dnext_h, dnext_c, cache):
    dx, dprev_h, dprev_c, dWx, dWh, db = None, None, None, None, None, None
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    x, prev_h, prev_c, Wx, Wh, b, i, f, o, g, ai, af, ao, ag, next_c = cache
    do = np.tanh(next_c) * dnext_h
    dtanh_next_c = o * dnext_h
    dnext_c_total = dnext_c + (1 - np.tanh(next_c) ** 2) * dtanh_next_c
    df = prev_c * dnext_c_total
    dprev_c = f * dnext_c_total
    di = g * dnext_c_total
    dg = i * dnext_c_total
    dai = i * (1 - i) * di  # sigmoid'(ai) = i * (1 - i), reusing the cached gates
    daf = f * (1 - f) * df
    dao = o * (1 - o) * do
    dag = (1 - np.tanh(ag) ** 2) * dg
    da = np.hstack((dai, daf, dao, dag))  # reassemble the (N, 4H) gate gradient
    dx = np.dot(da, Wx.T)
    dprev_h = np.dot(da, Wh.T)
    dWx = np.dot(x.T, da)
    dWh = np.dot(prev_h.T, da)
    db = np.sum(da, axis=0)
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return dx, dprev_h, dprev_c, dWx, dWh, db
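The same experiment as before, run through the LSTM cell state, tells the opposite story: dprev_c = f * dnext_c_total is a nearly linear path, so when the forget gate saturates near 1 (forced here with a large forget bias, a common initialization trick) the gradient survives many steps. A sketch with assumed toy sizes; it needs the assignment's sigmoid in scope since lstm_step_forward calls it:

np.random.seed(0)
N, D, H, T = 1, 10, 10, 50
Wx = np.random.randn(D, 4 * H) / np.sqrt(D)
Wh = np.random.randn(H, 4 * H) / np.sqrt(H)
b = np.zeros(4 * H)
b[H:2 * H] = 5.0               # large forget-gate bias -> f close to 1
h, c, caches = np.zeros((N, H)), np.zeros((N, H)), []
for t in range(T):
    h, c, cache = lstm_step_forward(np.random.randn(N, D), h, c, Wx, Wh, b)
    caches.append(cache)
dh, dc = np.zeros((N, H)), np.random.randn(N, H)
for t in reversed(range(T)):
    _, dh, dc, _, _, _ = lstm_step_backward(dh, dc, caches[t])
print(np.linalg.norm(dc))  # stays the same order of magnitude, not near zero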
def lstm_forward(x, h0, Wx, Wh, b):
    h, cache = None, None
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    N, T, D = x.shape
    H = h0.shape[1]
    h = np.zeros((N, T, H))
    cache = []
    prev_h = h0
    prev_c = np.zeros_like(prev_h)
    for t in range(T):
        next_h, next_c, cache_t = lstm_step_forward(x[:, t], prev_h, prev_c, Wx, Wh, b)
        h[:, t] = next_h
        cache.append(cache_t)
        prev_h = next_h
        prev_c = next_c
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return h, cache
def lstm_backward(dh, cache):
    dx, dh0, dWx, dWh, db = None, None, None, None, None
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    N, T, H = dh.shape
    _x = cache[0][0]
    D = _x.shape[1]
    dx = np.zeros((N, T, D))
    prev_dh = np.zeros((N, H))
    prev_dc = np.zeros((N, H))
    dWx = np.zeros((D, 4 * H))
    dWh = np.zeros((H, 4 * H))
    db = np.zeros(4 * H)
    for t in reversed(range(T)):
        dx[:, t, :], prev_dh, prev_dc, dWx_t, dWh_t, db_t = lstm_step_backward(
            dh[:, t, :] + prev_dh, prev_dc, cache[t]
        )
        dWx += dWx_t
        dWh += dWh_t
        db += db_t
    dh0 = prev_dh
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return dx, dh0, dWx, dWh, db
Temporal Affine Layer
An affine transform applied independently at every timestep, mapping each hidden state to a vector of vocabulary scores for the final captioning step.
def temporal_affine_forward(x, w, b):
    N, T, D = x.shape
    M = b.shape[0]
    out = x.reshape(N * T, D).dot(w).reshape(N, T, M) + b
    cache = x, w, b, out
    return out, cache
def temporal_affine_backward(dout, cache):
    x, w, b, out = cache
    N, T, D = x.shape
    M = b.shape[0]
    dx = dout.reshape(N * T, M).dot(w.T).reshape(N, T, D)
    dw = dout.reshape(N * T, M).T.dot(x.reshape(N * T, D)).T
    db = dout.sum(axis=(0, 1))
    return dx, dw, db
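A small sanity check (toy sizes assumed): the temporal version is just the same (w, b) transform applied independently at every timestep.

N, T, D, M = 2, 3, 4, 6
x = np.random.randn(N, T, D)
w, b = np.random.randn(D, M), np.random.randn(M)
out, _ = temporal_affine_forward(x, w, b)
print(np.allclose(out[:, 1], x[:, 1].dot(w) + b))  # True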
def temporal_softmax_loss(x, y, mask, verbose=False):
    N, T, V = x.shape
    x_flat = x.reshape(N * T, V)
    y_flat = y.reshape(N * T)
    mask_flat = mask.reshape(N * T)
    probs = np.exp(x_flat - np.max(x_flat, axis=1, keepdims=True))  # stable softmax
    probs /= np.sum(probs, axis=1, keepdims=True)
    loss = -np.sum(mask_flat * np.log(probs[np.arange(N * T), y_flat])) / N
    dx_flat = probs.copy()
    dx_flat[np.arange(N * T), y_flat] -= 1
    dx_flat /= N
    dx_flat *= mask_flat[:, None]
    if verbose:
        print("dx_flat: ", dx_flat.shape)
    dx = dx_flat.reshape(N, T, V)
    return loss, dx
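The mask is what lets captions of different lengths share a minibatch: positions holding the <NULL> padding token contribute neither loss nor gradient. A toy illustration (sizes assumed):

N, T, V = 2, 4, 7
x = np.random.randn(N, T, V)
y = np.random.randint(V, size=(N, T))
mask = np.ones((N, T), dtype=bool)
mask[:, 2:] = False                      # pretend the last two words are padding
loss_full, dx_full = temporal_softmax_loss(x, y, np.ones((N, T), dtype=bool))
loss_masked, dx_masked = temporal_softmax_loss(x, y, mask)
print(loss_masked < loss_full)           # True: masked timesteps add no loss
print(np.all(dx_masked[:, 2:] == 0))     # True: and no gradient either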
The Complete Model: CaptioningRNN
The pieces above are assembled into one model: image features are projected to the initial hidden state, input captions are embedded, run through the RNN or LSTM, and mapped to vocabulary scores, which the temporal softmax loss compares against the target captions.
def loss(self, features, captions):
    # captions_in is everything but the last word and is fed to the RNN;
    # captions_out is everything but the first word and is what we expect
    # the RNN to produce. The first element of captions_in is the <START>
    # token, and the first element of captions_out is the first real word.
    captions_in = captions[:, :-1]
    captions_out = captions[:, 1:]
    # You'll need this
    mask = captions_out != self._null
    # Weight and bias for the affine transform from image features to initial
    # hidden state
    W_proj, b_proj = self.params["W_proj"], self.params["b_proj"]
    # Word embedding matrix
    W_embed = self.params["W_embed"]
    # Input-to-hidden, hidden-to-hidden, and biases for the RNN
    Wx, Wh, b = self.params["Wx"], self.params["Wh"], self.params["b"]
    # Weight and bias for the hidden-to-vocab transformation.
    W_vocab, b_vocab = self.params["W_vocab"], self.params["b_vocab"]
    loss, grads = 0.0, {}
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    # Forward: features -> h0, captions_in -> embeddings -> RNN/LSTM -> scores
    out_proj, cache_proj = affine_forward(features, W_proj, b_proj)
    out_word_vector, cache_word_vector = word_embedding_forward(captions_in, W_embed)
    if self.cell_type == 'lstm':
        out_hidden, cache_hidden = lstm_forward(out_word_vector, out_proj, Wx, Wh, b)
    else:
        out_hidden, cache_hidden = rnn_forward(out_word_vector, out_proj, Wx, Wh, b)
    out_temporal_affine, cache_temporal_affine = temporal_affine_forward(out_hidden, W_vocab, b_vocab)
    loss, dscores = temporal_softmax_loss(out_temporal_affine, captions_out, mask)
    # Backward: reverse the pipeline, collecting a gradient for every parameter
    dh, grads['W_vocab'], grads['b_vocab'] = temporal_affine_backward(dscores, cache_temporal_affine)
    if self.cell_type == 'lstm':
        dx, dh0, grads['Wx'], grads['Wh'], grads['b'] = lstm_backward(dh, cache_hidden)
    else:
        dx, dh0, grads['Wx'], grads['Wh'], grads['b'] = rnn_backward(dh, cache_hidden)
    grads['W_embed'] = word_embedding_backward(dx, cache_word_vector)
    _, grads['W_proj'], grads['b_proj'] = affine_backward(dh0, cache_proj)
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return loss, grads
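To validate loss(), the notebook numerically checks every returned gradient on a tiny model. A paraphrased sketch (model, features, and captions are assumed to be built as in the notebook; eval_numerical_gradient ships with the starter code):

from cs231n.gradient_check import eval_numerical_gradient

loss_val, grads = model.loss(features, captions)
for name in sorted(grads):
    f = lambda _: model.loss(features, captions)[0]
    num_grad = eval_numerical_gradient(f, model.params[name], verbose=False, h=1e-6)
    print(name, np.max(np.abs(grads[name] - num_grad)))  # expect errors near 1e-6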
def sample(self, features, max_length=30):
    N = features.shape[0]
    captions = self._null * np.ones((N, max_length), dtype=np.int32)
    # Unpack parameters
    W_proj, b_proj = self.params["W_proj"], self.params["b_proj"]
    W_embed = self.params["W_embed"]
    Wx, Wh, b = self.params["Wx"], self.params["Wh"], self.params["b"]
    W_vocab, b_vocab = self.params["W_vocab"], self.params["b_vocab"]
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    prev_h, _ = affine_forward(features, W_proj, b_proj)  # image features -> h0
    prev_c = np.zeros_like(prev_h)
    captions[:, 0] = self._start
    curr_word = np.ones((N, 1), dtype=np.int32) * self._start
    for t in range(max_length - 1):
        word_embed, _ = word_embedding_forward(curr_word, W_embed)
        if self.cell_type == 'lstm':
            h, c, _ = lstm_step_forward(np.squeeze(word_embed), prev_h, prev_c, Wx, Wh, b)
        else:
            h, _ = rnn_step_forward(np.squeeze(word_embed), prev_h, Wx, Wh, b)
        scores, _ = temporal_affine_forward(h[:, np.newaxis, :], W_vocab, b_vocab)
        idx_best = np.squeeze(np.argmax(scores, axis=2))  # greedy: best word wins
        captions[:, t + 1] = idx_best
        curr_word = captions[:, t + 1]  # feed the sampled word back in
        prev_h = h
        if self.cell_type == 'lstm':
            prev_c = c
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    return captions
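Hypothetical usage (model, features, and idx_to_word come from the notebook; the names here are assumed): sample() greedy-decodes one caption per image, which can then be mapped back to words.

captions = model.sample(features, max_length=17)
for row in captions:
    print(" ".join(idx_to_word[idx] for idx in row))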
References:
https://github.com/chongyangma/cs231n/blob/master/assignments/assignment3/cs231n/classifiers/rnn.py
https://github.com/lionkingchuchu/cs231n/blob/main/assignment3/cs231n/transformer_layers.py