🚀 Understanding the Transformer with Python!
🔥 Mastering the Transformer, end to end, in Python!
"Attention is All You Need" - Google's revolutionary 2017 paper 🎯
Now let's implement the Transformer from start to finish in Python! 💪
🎯 Table of Contents
- 📖 Transformer Overview
- 📜 The History of Attention
- 🐌 Early Attention (SHCA: Single-Head Cross Attention)
- ⚡ 2017: The Transformer Revolution
- 🧠 How Self-Attention Works
- 🚀 SHCA → MHCA: A Revolutionary Shift
- 🔥 The Transformer's Three Big Innovations
- 📈 Development Timeline
- 🏗️ Implementing a Transformer Block
- 🎯 Summary and Wrap-up
- 📚 References
📖 Transformer Overview
🤔 What is a Transformer?
The Transformer is a neural network architecture first introduced in Google's 2017 paper "Attention is All You Need"!
📜 The History of Attention
The concept of attention existed well before the Transformer appeared! 🕰️
However, the Q / K / V formulation we know today did not exist yet!!
Q/K/V is terminology that was only clearly pinned down with the Transformer and afterwards!!
**Early Attention (SHCA: Single-Head Cross Attention)** 🐌
- LSTM encoding: a classic LSTM reads the sentence in order and produces a hidden_state (to be consumed later)!!
- LSTM decoding: the output words are generated from that hidden_state!
- The problem: the longer the sentence, the more the model forgets what came at the beginning!!
- So attention was proposed!! → When the LSTM generates an output, it no longer relies on a single hidden_state; it looks back over the entire source sentence and decides where to focus!!
So!! The characteristics of early attention (an AI-assisted summary):
- 🔄 Used together with RNNs: combined with LSTM/GRU encoder-decoder models
- 📍 One direction only: the decoder "looks at" the encoder (cross attention)
- 🎯 Solved a translation problem: prevents information loss in long sentences
- 🐌 Single-head only: the multi-head idea did not exist yet!
- 📚 Representative work: Bahdanau (2015), Luong (2015)
```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class OldAttention(nn.Module):
    """🐌 2015-style Seq2Seq + Attention example (Bahdanau-style)"""

    def __init__(self, input_size, hidden_size):
        super().__init__()
        # 🔹 Encoder: LSTM that processes the input sequence (hidden_size is the output dim)
        self.encoder_rnn = nn.LSTM(input_size, hidden_size)
        # 🔹 Decoder: LSTM that generates the output sequence step by step
        self.decoder_rnn = nn.LSTM(input_size, hidden_size)
        # 🔸 Linear layer that computes the attention score
        #    (concatenate encoder and decoder hidden states, then score: concat → score)
        self.attention = nn.Linear(hidden_size * 2, 1)

    def forward(self, encoder_outputs, decoder_hidden):
        """
        Args:
            encoder_outputs: (seq_len, hidden_size) - the full encoder output sequence
            decoder_hidden: (1, hidden_size) - the current decoder hidden state
        Returns:
            context: (1, hidden_size) - weighted sum of the encoder outputs
            attention_weights: (seq_len, 1) - softmax attention weights
        """
        attention_scores = []
        # 🔁 Compare each encoder output vector against the current decoder state to get a score
        for encoder_output in encoder_outputs:
            # 🧩 Concatenate the encoder output and the decoder state
            # decoder_hidden plays the role of Q; encoder_output plays both K and V!
            ## decoder_hidden asks "what am I looking for?" (Query),
            ## while encoder_output is both the feature of that encoder position (K)
            ## and the vector carrying the actual information (V)!
            combined = torch.cat([encoder_output.unsqueeze(0), decoder_hidden], dim=1)
            # 📏 Pass through the linear layer to get a scalar score
            score = self.attention(combined)  # (1, 1)
            attention_scores.append(score)
        # 📐 attention_scores: [(1,1), (1,1), ...] → (seq_len, 1)
        attention_scores = torch.cat(attention_scores, dim=0)
        # 🎲 Softmax turns the scores into attention weights (a probability distribution)
        attention_weights = F.softmax(attention_scores, dim=0)  # (seq_len, 1)
        # 🧮 Weighted sum of the encoder outputs
        # encoder_outputs:   (seq_len, hidden_size)
        # attention_weights: (seq_len, 1) → broadcasting
        context = torch.sum(attention_weights * encoder_outputs, dim=0, keepdim=True)  # (1, hidden_size)
        return context, attention_weights  # context is the "summary" handed to the decoder
```
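A minimal shape check of the module above, assuming the fixed `OldAttention` as written; the sizes (`input_size=16`, `seq_len=5`) are arbitrary illustration values, not anything fixed by the post:

```python
# Hypothetical sizes, just to sanity-check the shapes OldAttention returns.
seq_len, hidden_size = 5, 16
attn = OldAttention(input_size=16, hidden_size=hidden_size)

encoder_outputs = torch.randn(seq_len, hidden_size)   # pretend encoder outputs
decoder_hidden = torch.randn(1, hidden_size)          # pretend current decoder state

context, weights = attn(encoder_outputs, decoder_hidden)
print(context.shape)   # torch.Size([1, 16])
print(weights.shape)   # torch.Size([5, 1]) - one weight per source position
```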
⚡ 2017: The Transformer Revolution
🎭 Enter Multi-Head Attention (MHCA)!
This is the "Attention is All You Need" we all know.
Attention alone, with no RNN. This is where the Q, K, V formulation was settled, the Q=K=V self-attention idea appeared,
and the multi-head idea of looking at the sequence from several angles at once was introduced!
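For reference, the scaled dot-product attention and its multi-head combination from the paper, with $d_k$ the per-head dimension and $W^O$ the output projection:

$$
\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_k}}\right) V
$$

$$
\mathrm{MultiHead}(Q, K, V) = \mathrm{Concat}(\mathrm{head}_1, \dots, \mathrm{head}_h)\, W^{O},
\qquad \mathrm{head}_i = \mathrm{Attention}(Q W_i^{Q},\, K W_i^{K},\, V W_i^{V})
$$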
```python
import math

# The Transformer's Multi-Head Self-Attention (2017)
class MultiHeadSelfAttention_2017(nn.Module):
    """The revolutionary Multi-Head Self-Attention"""

    def __init__(self, d_model, n_heads=8):
        super().__init__()
        self.n_heads = n_heads
        self.d_k = d_model // n_heads  # dimension of each head

        # Projections for all attention heads at once!
        self.W_q = nn.Linear(d_model, d_model)  # all 8 heads together
        self.W_k = nn.Linear(d_model, d_model)  # all 8 heads together
        self.W_v = nn.Linear(d_model, d_model)  # all 8 heads together
        self.W_o = nn.Linear(d_model, d_model)  # output projection

    def forward(self, x, mask=None):
        batch_size, seq_len, d_model = x.size()

        # Split Q, K, V into multiple heads
        Q = self.W_q(x).view(batch_size, seq_len, self.n_heads, self.d_k).transpose(1, 2)
        K = self.W_k(x).view(batch_size, seq_len, self.n_heads, self.d_k).transpose(1, 2)
        V = self.W_v(x).view(batch_size, seq_len, self.n_heads, self.d_k).transpose(1, 2)

        # Each head computes attention independently!
        attention_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            # Optional mask, broadcastable to (batch, n_heads, seq_len, seq_len);
            # masked positions are set to -inf before the softmax
            attention_scores = attention_scores.masked_fill(mask == 0, float('-inf'))
        attention_weights = F.softmax(attention_scores, dim=-1)

        # Apply the attention weights
        context = torch.matmul(attention_weights, V)

        # Concatenate heads
        context = context.transpose(1, 2).contiguous().view(
            batch_size, seq_len, d_model
        )

        # Final output projection
        output = self.W_o(context)
        return output, attention_weights


# Comparison: Single-Head vs Multi-Head (conceptual pseudocode only;
# single_attention_head, attention_head and concatenate_and_project are
# illustrative placeholders, not real functions)
def compare_attention_mechanisms(x):
    """SHCA vs MHCA comparison"""
    # Single-Head (2015 style)
    single_head_output = single_attention_head(x)  # 1 perspective

    # Multi-Head (2017 style)
    multi_head_output = []
    for head in range(8):  # 8 different perspectives!
        head_output = attention_head(x, head_id=head)
        multi_head_output.append(head_output)

    # Combine the results of the 8 heads
    combined_output = concatenate_and_project(multi_head_output)
    return combined_output
```
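A quick sanity check of the module above, assuming the class as written; the sizes are arbitrary:

```python
# Arbitrary sizes, just to check the shapes produced by MultiHeadSelfAttention_2017.
x = torch.randn(2, 10, 512)                         # (batch, seq_len, d_model)
mhsa = MultiHeadSelfAttention_2017(d_model=512, n_heads=8)

out, weights = mhsa(x)
print(out.shape)      # torch.Size([2, 10, 512])
print(weights.shape)  # torch.Size([2, 8, 10, 10]) - one attention map per head
```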
🧠 How Self-Attention Works
Self-attention is a mechanism that computes "how relevant each word is to every other word"!
```python
# Example: "The cat sat on the mat"
# How related is the word "cat" to the other words?
# cat -> The (0.1), cat (1.0), sat (0.8), on (0.2), the (0.1), mat (0.3)
```
🔍 Query, Key, Value
Think of it like a search engine! 🔍
- Query (Q): "what I am looking for" - the current word's question
- Key (K): "the search keyword" - what each other word is about
- Value (V): "the actual content" - the information each word carries
```python
import torch
import torch.nn as nn
import torch.nn.functional as F
import math


def simple_attention_example():
    """A tiny attention example"""
    # Toy sentence: "I love AI"
    # Each word is represented as a 3-dimensional vector
    sentence = torch.tensor([
        [1.0, 0.0, 0.0],  # "I"
        [0.0, 1.0, 0.0],  # "love"
        [0.0, 0.0, 1.0]   # "AI"
    ])

    # Query, Key, Value (simplified: no learned projections)
    Q = sentence  # Query: what is each word looking for?
    K = sentence  # Key: what is each word about?
    V = sentence  # Value: the actual information of each word

    # Attention scores
    attention_scores = torch.matmul(Q, K.transpose(-2, -1))
    print("Attention Scores:")
    print(attention_scores)

    # Softmax turns the scores into probabilities
    attention_weights = F.softmax(attention_scores, dim=-1)
    print("\nAttention Weights:")
    print(attention_weights)

    # Final output
    output = torch.matmul(attention_weights, V)
    print("\nFinal Output:")
    print(output)

# Run it
simple_attention_example()
```
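One detail the toy example skips is the sqrt(d_k) scaling from the paper. Inside `simple_attention_example`, a scaled version of the score line would look roughly like this (a sketch reusing the function's local `Q`, `K`, and `sentence`):

```python
# Inside simple_attention_example, with the paper's scaling applied:
d_k = sentence.size(-1)                                            # here d_k = 3
attention_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(d_k)
attention_weights = F.softmax(attention_scores, dim=-1)
```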
🚀 SHCA → MHCA: A Revolutionary Shift
| SHCA (2015) | MHCA (2017) |
|---|---|
| 🎯 1 perspective | 🎭 8 perspectives |
| 🔄 Cross attention only | 🔄 Self + cross attention |
| 📏 Depends on an RNN | 🚫 RNN removed |
| 🐌 Sequential processing | ⚡ Parallel processing |
| 📊 Simple weighted sum | 🧠 Richer representations |
💪 What Multi-Head buys you
Each head learns a different kind of relationship (see the conceptual sketch below):
```python
# Example: analysing "The cat sat on the mat"
sentence = "The cat sat on the mat"

# Head 1: learns grammatical relationships
head_1_attention = [
    # "cat" → "The" (article-noun relationship)
    # "sat" → "cat" (subject-verb relationship)
    # "on"  → "sat" (verb-preposition relationship)
]

# Head 2: learns semantic relationships
head_2_attention = [
    # "cat" → "mat" (the cat relates to the mat)
    # "sat" → "on"  (the action and its location)
]

# Head 3: learns positional relationships
head_3_attention = [
    # relationships between adjacent words
    # "The" → "cat", "cat" → "sat", etc.
]

# Heads 4-8: yet other kinds of abstract relationships...
```
🔥 Why Multi-Head is such a big win:
- 🎯 Multiple perspectives: grammar, semantics, position, all learned at once
- 🧠 Richer representations: captures complex language patterns
- ⚡ Parallel computation: every head is processed simultaneously
- 📈 Better results: large gains in translation and understanding quality in practice
💡 Bottom line: it completely overcomes the limits of single-head attention (a tiny reshape demo of the head split follows below)! 🚀
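As a small illustration of that head split (the sizes are just examples, not from the original post), each token's d_model-dimensional vector is reshaped into n_heads smaller views that attend in parallel:

```python
import torch

# Illustrative sizes only: split each token's d_model vector into 8 per-head views.
batch, seq_len, d_model, n_heads = 1, 6, 512, 8
d_k = d_model // n_heads                       # 64 dimensions per head

x = torch.randn(batch, seq_len, d_model)
heads = x.view(batch, seq_len, n_heads, d_k).transpose(1, 2)
print(heads.shape)   # torch.Size([1, 8, 6, 64]) - 8 heads, each seeing 64 dims per token
```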
🔥 The Transformer's Three Big Innovations
| Classic attention (2015) | Transformer attention (2017) |
|---|---|
| 🔄 Needs an RNN | 🚫 No RNN |
| 📏 Encoder→decoder only | 🔄 Self-attention |
| 🎯 Single head | 🎭 Multi-head |
| 🐌 Sequential processing | ⚡ Parallel processing |
The key points:
- 🧠 Self-attention: every position in the same sequence learns its relationship to every other position
- 🎭 Multi-head: attention is computed from several perspectives at the same time
- ⚡ Parallelisation: sequences can be processed without an RNN
A tiny contrast of the two processing styles follows below.
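A small illustrative contrast (not from the original post): an RNN has to walk the sequence step by step, while self-attention scores every pair of positions in a single matrix product:

```python
import torch
import torch.nn as nn

seq_len, d = 6, 16
x = torch.randn(1, seq_len, d)

# RNN: positions are processed one after another (a sequential loop over time steps)
rnn = nn.RNN(d, d, batch_first=True)
h = torch.zeros(1, 1, d)
outputs = []
for t in range(seq_len):
    o, h = rnn(x[:, t:t + 1, :], h)   # each step depends on the previous hidden state
    outputs.append(o)

# Self-attention: all pairwise scores in one matrix product (parallel over positions)
scores = x @ x.transpose(-2, -1) / d ** 0.5        # (1, seq_len, seq_len)
context = torch.softmax(scores, dim=-1) @ x        # every position is computed at once
```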
📈 Development Timeline

```mermaid
timeline
    title The evolution of Attention
    2014 : Neural Machine Translation
         : Encoder-Decoder models appear
    2015 : Bahdanau Attention
         : the first attention mechanism
         : Luong Attention
    2017 : Transformer
         : "Attention is All You Need"
         : the Self-Attention revolution
    2018+ : BERT and GPT appear
          : Transformer-based models everywhere
```
💡 Conclusion: attention was not a new concept, but the Transformer lifted it to a completely new level! 🚀
The core ideas:
- 🚫 Sequence data can be processed without an RNN/LSTM
- ⚡ Parallel processing massively speeds up training
- 🎯 The self-attention mechanism handles long-range dependencies
🏗️ Implementing a Transformer Block
Now let's build a complete Transformer block! 🚀
```python
class TransformerBlock(nn.Module):
    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super().__init__()
        # Multi-Head Attention (reusing the MultiHeadSelfAttention_2017 module defined above)
        self.attention = MultiHeadSelfAttention_2017(d_model, n_heads)

        # Feed-Forward Network
        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model)
        )

        # Layer Normalization
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)

        # Dropout
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """
        Args:
            x: Input tensor (batch_size, seq_len, d_model)
            mask: Optional attention mask
        Returns:
            output: Transformer block output
        """
        # 1. Multi-Head Attention + Residual Connection + Layer Norm
        attn_output, attn_weights = self.attention(x, mask)
        x = self.ln1(x + self.dropout(attn_output))

        # 2. Feed-Forward + Residual Connection + Layer Norm
        ff_output = self.feed_forward(x)
        x = self.ln2(x + self.dropout(ff_output))

        return x, attn_weights


# A complete Transformer model! 🚀
class SimpleTransformer(nn.Module):
    def __init__(self, vocab_size, d_model=512, n_heads=8, n_layers=6,
                 d_ff=2048, max_seq_len=5000, dropout=0.1):
        super().__init__()
        self.d_model = d_model

        # Embedding layers
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.position_embedding = nn.Embedding(max_seq_len, d_model)

        # Transformer blocks
        self.transformer_blocks = nn.ModuleList([
            TransformerBlock(d_model, n_heads, d_ff, dropout)
            for _ in range(n_layers)
        ])

        # Final layer norm
        self.ln_final = nn.LayerNorm(d_model)

        # Dropout
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """
        Args:
            x: Input token ids (batch_size, seq_len)
            mask: Optional attention mask
        Returns:
            output: Transformer output (batch_size, seq_len, d_model)
            attention_weights: List of attention weights from each layer
        """
        batch_size, seq_len = x.size()

        # Token embeddings
        token_emb = self.token_embedding(x)

        # Position embeddings
        positions = torch.arange(seq_len, device=x.device).unsqueeze(0).expand(batch_size, -1)
        pos_emb = self.position_embedding(positions)

        # Combine embeddings
        x = self.dropout(token_emb + pos_emb)

        # Apply transformer blocks
        attention_weights = []
        for block in self.transformer_blocks:
            x, attn_weights = block(x, mask)
            attention_weights.append(attn_weights)

        # Final layer norm
        x = self.ln_final(x)

        return x, attention_weights


# A real usage example! 🎯
def test_transformer():
    """Test the Transformer model"""
    # Create the model
    vocab_size = 10000
    model = SimpleTransformer(vocab_size)

    # Create dummy data
    batch_size, seq_len = 2, 50
    input_ids = torch.randint(0, vocab_size, (batch_size, seq_len))

    # Forward pass
    output, attention_weights = model(input_ids)

    print("🚀 Transformer Results:")
    print(f"Input shape: {input_ids.shape}")
    print(f"Output shape: {output.shape}")
    print(f"Number of layers: {len(attention_weights)}")

    # Count model parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Total parameters: {total_params:,}")

    return model, output, attention_weights

# Run it
model, output, attention_weights = test_transformer()
```
🎯 Summary and Wrap-up
💡 Key points
"Attention is All You Need" - attention really was enough! 🎯
Sequence modelling works without an RNN/LSTM, and thanks to parallel processing it is far faster and more efficient.
🎉 Congratulations! You can now understand and implement the Transformer from end to end! 💪
📚 References
- Attention Is All You Need (original paper)
- The Illustrated Transformer
- PyTorch Official Tutorial
- Hugging Face Transformers