🧠 SAM2 Hands-On Practice!! with Python
🧬 (English) SAM2 Practice!!
Today, let's revisit the new SAM model, SAM2, which we studied theoretically in a previous post and also experimented with before!
This time, instead of using ultralytics, we'll fetch the model directly from Hugging Face!!
However... compared to EfficientSAM, the results from SAM2 aren't really satisfying to me...
🔧 1. Installation & Setup
Clone directly from GitHub and install.
I created a virtual environment called sam2 beforehand!
conda create --name sam2 python=3.12
conda activate sam2
git clone https://github.com/facebookresearch/sam2.git && cd sam2
pip install -e .
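If you want to confirm the editable install is visible inside the sam2 environment, an optional quick check is to import the package and print where it was loaded from:

python -c "import sam2; print(sam2.__file__)"

If this prints a path inside the cloned sam2 repository, the setup is ready.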
Additionally, although the SAM2 GitHub readme suggests downloading the checkpoints via ./download_ckpts.sh, the code can also fetch the weights by itself through hf_hub_download, so we can skip that step!!
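In practice this means the from_pretrained call used below is all we need: on the first run it pulls the checkpoint and config from the Hugging Face Hub into the local cache (via hf_hub_download under the hood), and later runs reuse the cached files. A minimal sketch:

from sam2.sam2_image_predictor import SAM2ImagePredictor

# First call downloads facebook/sam2-hiera-large from the Hugging Face Hub
# and caches it locally; no manual checkpoint download is required.
predictor = SAM2ImagePredictor.from_pretrained("facebook/sam2-hiera-large")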
🖼️ 2. Image Segmentation
Just like in the EfficientSAM practice,
I'll use a dog photo with only two prompt points!!
import torch
import numpy as np
from PIL import Image, ImageDraw
import os

from sam2.sam2_image_predictor import SAM2ImagePredictor

# 1. Device setup
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# 2. Load SAM2 model
predictor = SAM2ImagePredictor.from_pretrained("facebook/sam2-hiera-large")

# 3. Load image
image_path = "./EfficientSAM_gdino/figs/examples/dogs.jpg"
output_image_path = "output_masked_image.png"
image_pil = Image.open(image_path).convert("RGB")
image_np = np.array(image_pil)

# -----------------------------------------------------
# 4. Prepare prompt
# -----------------------------------------------------
input_points = torch.tensor([[[580, 350], [650, 350]]], device=device)
input_labels = torch.tensor([[1, 1]], device=device)

# 5. Run prediction
with torch.inference_mode(), torch.autocast(device_type=device.split(":")[0], dtype=torch.bfloat16):
    predictor.set_image(image_np)
    masks, scores, _ = predictor.predict(
        point_coords=input_points,
        point_labels=input_labels,
    )

mask = masks[0]
print(f"mask shape :{mask.shape}")

# Create black canvas
segmented_image_np = np.zeros_like(image_np)

# Convert mask
binary_mask = (mask > 0.5)

# Apply mask
segmented_image_np[binary_mask] = image_np[binary_mask]

# Convert back to image
result_image = Image.fromarray(segmented_image_np)

# Draw prompt points
draw_result = ImageDraw.Draw(result_image)
points_np = input_points[0].cpu().numpy()
labels_np = input_labels[0].cpu().numpy()

for i, (x, y) in enumerate(points_np):
    label = labels_np[i]
    fill_color = "green" if label == 1 else "red"
    outline_color = "white"
    radius = 5
    if label == 1:
        draw_result.ellipse((x - radius, y - radius, x + radius, y + radius),
                            fill=fill_color, outline=outline_color, width=1)
    else:
        draw_result.line((x - radius, y - radius, x + radius, y + radius),
                         fill=fill_color, width=2)
        draw_result.line((x + radius, y - radius, x - radius, y + radius),
                         fill=fill_color, width=2)

result_image.save(output_image_path)
print(f"Result saved to '{output_image_path}'")
And the result image is!!!??
Totally disappointing...
So, I tried with 4 prompt points and switched the model to sam2.1_hiera_large.pt!!
Still disappointing...
GPT explained it's due to differences in how the prompts are interpreted.
But honestly, I still prefer EfficientSAM!!
EfficientSAM didn't actually "perform better," but rather interpreted your imperfect prompts more "forgivingly."
SAM2, on the other hand, is much more powerful and precise, so to fully leverage it, you must provide more accurate prompts.
As suggested before, if you place several points on the dogs' bodies and heads, you'll see that SAM2 produces masks that are far more refined and higher quality than EfficientSAM's.
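To make that concrete, here is a minimal sketch of a denser prompt; the extra coordinates are illustrative guesses for the dogs.jpg photo (not the values from my actual run), and it assumes the SAM 2.1 checkpoint is available on the Hub as facebook/sam2.1-hiera-large.

# Hypothetical denser prompt: four positive points spread over one dog's head and body
predictor = SAM2ImagePredictor.from_pretrained("facebook/sam2.1-hiera-large")

input_points = torch.tensor([[[580, 350], [650, 350], [620, 280], [600, 420]]], device=device)
input_labels = torch.tensor([[1, 1, 1, 1]], device=device)  # 1 = positive point
# A background region could also be excluded with a negative point (label 0), e.g.:
# input_points = torch.tensor([[[580, 350], [650, 350], [620, 280], [100, 100]]], device=device)
# input_labels = torch.tensor([[1, 1, 1, 0]], device=device)  # last point is negative

with torch.inference_mode(), torch.autocast(device_type=device.split(":")[0], dtype=torch.bfloat16):
    predictor.set_image(image_np)
    masks, scores, _ = predictor.predict(point_coords=input_points, point_labels=input_labels)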
🧪 3. Video Segmentation
Starting with the result!!! This one looks good~~
Code is below:
import torch
import numpy as np
import cv2
import os

from sam2.sam2_video_predictor import SAM2VideoPredictor
from moviepy.editor import VideoFileClip

# 1. Device setup
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Paths
output_video_path = "output_segmented_video_50_55s.mp4"
clipped_video_path = "temp_clip.mp4"

# 3. Load video
cap = cv2.VideoCapture(clipped_video_path)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

video_frames = []
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    video_frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
cap.release()
print(f"Loaded clip: {width}x{height}, {total_frames} frames, {fps:.2f} FPS")

# 4. Load SAM2 model
print("Loading SAM2 video predictor...")
predictor = SAM2VideoPredictor.from_pretrained("facebook/sam2-hiera-large")

# -----------------------------------------------------
# 5. Prompt setup
# -----------------------------------------------------
prompt_frame_idx = 0  # frame index to add prompt
prompt_obj_id = 1     # unique object ID

# Coordinates and labels
points = np.array([[width // 2, height // 2]], dtype=np.float32)
labels = np.array([1], dtype=np.int32)

# -----------------------------------------------------
# 6. Initialize and predict
# -----------------------------------------------------
with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
    print("Initializing predictor state...")
    state = predictor.init_state(clipped_video_path)

    print("Adding prompt on first frame...")
    _, _, masks = predictor.add_new_points_or_box(
        inference_state=state,
        frame_idx=prompt_frame_idx,
        obj_id=prompt_obj_id,
        points=points,
        labels=labels,
    )

    # 7. Propagate in video
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out_writer = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))

    print("Propagating masks across video...")
    for frame_idx, object_ids, masks in predictor.propagate_in_video(state):
        original_frame = video_frames[frame_idx]
        segmented_image_np = np.full_like(original_frame, 255)

        if prompt_obj_id in object_ids:
            mask_logits = masks[0][0].cpu().numpy()
            binary_mask_before_resize = (mask_logits > 0.0).astype(np.uint8)
            resized_mask = cv2.resize(binary_mask_before_resize, (width, height), interpolation=cv2.INTER_NEAREST)
            boolean_mask = (resized_mask == 1)
            segmented_image_np[boolean_mask] = original_frame[boolean_mask]

        # Draw red point
        for x, y in points:
            cv2.circle(segmented_image_np, (int(x), int(y)), radius=5, color=(255, 0, 0), thickness=-1)

        output_frame = cv2.cvtColor(segmented_image_np, cv2.COLOR_RGB2BGR)
        out_writer.write(output_frame)
        print(f"\r- Processing: frame {frame_idx + 1}/{total_frames}", end="")

    out_writer.release()
    print(f"\nVideo segmentation complete! Saved to '{output_video_path}'")
SAM2's video tracking is really impressive!!!
🧬 (Korean) SAM2 Practice!!
Today, let's revisit the new SAM model, SAM2, which we studied theoretically in a previous post and also tried out in practice before!
This time, unlike last time, we'll fetch the model from Hugging Face instead of using ultralytics!!
However... this SAM2's results aren't as satisfying as EfficientSAM's...
🔧 1. Installation & Setup
Clone directly from GitHub and install.
I created a virtual environment called sam2 beforehand!!
conda create --name sam2 python=3.12
conda activate sam2
git clone https://github.com/facebookresearch/sam2.git && cd sam2
pip install -e .
Additionally, although the SAM2 GitHub readme tells you to download the model via ./download_ckpts.sh, the code can fetch the weights by itself through hf_hub_download, so we can skip that!!
🖼️ 2. Image Segmentation
Just like in the previous EfficientSAM practice, I'll use the dog photo!!
And for the prompt, only two points!!
import torch
import numpy as np
from PIL import Image, ImageDraw
import os

from sam2.sam2_image_predictor import SAM2ImagePredictor

# 1. Basic setup
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# 2. Load the SAM2 model
predictor = SAM2ImagePredictor.from_pretrained("facebook/sam2-hiera-large")

# 3. Prepare the image
image_path = "./EfficientSAM_gdino/figs/examples/dogs.jpg"
output_image_path = "output_masked_image.png"
image_pil = Image.open(image_path).convert("RGB")
image_np = np.array(image_pil)

# -----------------------------------------------------
# 4. Prepare the prompt
# -----------------------------------------------------
# input_points is a 3D tensor of shape (batch, num_points, 2).
input_points = torch.tensor([[[580, 350], [650, 350]]], device=device)
input_labels = torch.tensor([[1, 1]], device=device)
# input_labels = torch.tensor([[1, 1, 1, 1]], device=device)  # all points positive

# 5. Run prediction
with torch.inference_mode(), torch.autocast(device_type=device.split(":")[0], dtype=torch.bfloat16):
    predictor.set_image(image_np)
    masks, scores, _ = predictor.predict(
        point_coords=input_points,
        point_labels=input_labels,
    )

mask = masks[0]
print(f"mask shape :{mask.shape}")

# Create an all-black NumPy array the same size as the original image
segmented_image_np = np.zeros_like(image_np)

# Copy the original pixels only where the mask is True.
# mask may be a boolean array (True/False) or a float array in [0, 1];
# if it is a float array, apply a threshold to get a boolean mask.
binary_mask = (mask > 0.5)  # True/False mask using 0.5 as the threshold

# Assign the original pixels at the positions where the mask is True
segmented_image_np[binary_mask] = image_np[binary_mask]

# Convert the NumPy array back to a PIL Image
result_image = Image.fromarray(segmented_image_np)

# Draw the prompt points on the image (optional)
draw_result = ImageDraw.Draw(result_image)
points_np = input_points[0].cpu().numpy()
labels_np = input_labels[0].cpu().numpy()

for i, (x, y) in enumerate(points_np):
    label = labels_np[i]
    fill_color = "green" if label == 1 else "red"
    outline_color = "white"
    radius = 5
    if label == 1:
        draw_result.ellipse((x - radius, y - radius, x + radius, y + radius), fill=fill_color, outline=outline_color, width=1)
    else:
        draw_result.line((x - radius, y - radius, x + radius, y + radius), fill=fill_color, width=2)
        draw_result.line((x + radius, y - radius, x - radius, y + radius), fill=fill_color, width=2)

result_image.save(output_image_path)
print(f"Result image saved to '{output_image_path}'.")
And the result image is!!!??
Totally disappointing...??
So I tried again with 4 prompt points and switched the model to sam2.1_hiera_large.pt!!
The result was still disappointing... I asked GPT, and it said the difference comes from how the prompts are interpreted...
Even so, I still prefer EfficientSAM!!
EfficientSAM didn't actually "perform better"; rather, it interpreted your imprecise prompts more "forgivingly."
SAM2, on the other hand, is a much more powerful and precise tool, so to fully leverage its performance you need to provide more accurate prompts. As suggested in the previous answer, if you place several points on the dogs' bodies and heads, you will see that SAM2 produces far more refined, higher-quality masks than EfficientSAM.
🧪 3. Video Segmentation
Starting with the result!!! I like this one~~
The code is as follows!
import torch
import numpy as np
import cv2
import os

from sam2.sam2_video_predictor import SAM2VideoPredictor
from moviepy.editor import VideoFileClip

# 1. Basic setup
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Source and output paths
output_video_path = "output_segmented_video_50_55s.mp4"
clipped_video_path = "temp_clip.mp4"

# 3. Load the video
cap = cv2.VideoCapture(clipped_video_path)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

video_frames = []
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    video_frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
cap.release()
print(f"Clip loaded: {width}x{height}, {total_frames} frames, {fps:.2f} FPS")

# 4. Load the SAM2 model
print("Loading the SAM2 video predictor...")
predictor = SAM2VideoPredictor.from_pretrained("facebook/sam2-hiera-large")

# -----------------------------------------------------
# 5. Prompt setup (this part was revised)
# -----------------------------------------------------
prompt_frame_idx = 0  # frame index to attach the prompt to (0 = the first frame)
prompt_obj_id = 1     # unique ID of the object to track (first object, so 1)

# Point coordinates: NumPy array of shape (NumPoints, 2)
points = np.array([[width // 2, height // 2]], dtype=np.float32)
# Labels: NumPy array of shape (NumPoints,)
labels = np.array([1], dtype=np.int32)

# -----------------------------------------------------
# 6. Initialize the model and predict on the first frame
# -----------------------------------------------------
with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
    print("Initializing the predictor state...")
    state = predictor.init_state(clipped_video_path)

    print("Adding the prompt on the first frame...")
    # Function name and arguments changed to match the example code
    _, _, masks = predictor.add_new_points_or_box(
        inference_state=state,
        frame_idx=prompt_frame_idx,
        obj_id=prompt_obj_id,
        points=points,
        labels=labels,
    )

    # 7. Propagate through the video and save the result
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out_writer = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))

    print("Propagating the mask across the whole clip and saving the result...")
    for frame_idx, object_ids, masks in predictor.propagate_in_video(state):
        original_frame = video_frames[frame_idx]
        # segmented_image_np = np.zeros_like(original_frame)  # black background alternative
        segmented_image_np = np.full_like(original_frame, 255)

        # Check that prompt_obj_id (1) is being tracked
        if prompt_obj_id in object_ids:
            mask_logits = masks[0][0].cpu().numpy()
            binary_mask_before_resize = (mask_logits > 0.0).astype(np.uint8)
            resized_mask = cv2.resize(binary_mask_before_resize, (width, height), interpolation=cv2.INTER_NEAREST)
            # Final conversion of the resized mask to a boolean array
            boolean_mask = (resized_mask == 1)
            # Index with the boolean mask
            segmented_image_np[boolean_mask] = original_frame[boolean_mask]

        # --- Added part: draw the prompt point on every frame ---
        for x, y in points:
            # Draw a red dot with cv2.circle; segmented_image_np is RGB, so red is (255, 0, 0).
            # thickness=-1 means a filled circle.
            cv2.circle(segmented_image_np, (int(x), int(y)), radius=5, color=(255, 0, 0), thickness=-1)

        output_frame = cv2.cvtColor(segmented_image_np, cv2.COLOR_RGB2BGR)
        out_writer.write(output_frame)
        print(f"\r- Processing: frame {frame_idx + 1}/{total_frames}", end="")

    out_writer.release()
    print(f"\nVideo segmentation complete! The result was saved to '{output_video_path}'.")
SAM2's tracking on video is really impressive!^^