🔧 EfficientSAM Hands-On Practice!! with Python
Today, following the previous post where we studied the theory,
we will run a hands-on experiment with the lightweight segmentation model EfficientSAM!
✔️ Main experiment objectives:
- Use EfficientSAM with GPU
- Prompt-based segmentation
- Prompts can be given either as Box or Point
🔧 1. Installation & Setup
Unlike Hugging Face Transformers, EfficientSAM is not distributed as a pip package, so you need to clone the GitHub repository directly.
# 1. Clone the EfficientSAM repository
git clone https://github.com/ChaoningZhang/EfficientSAM.git
cd EfficientSAM
- Conveniently, the weights folder already contains the pretrained model efficient_sam_vits.pt.zip (you can verify this with the quick check below).
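Before running anything, you can optionally peek inside the archive to confirm the checkpoint is really there. This is a tiny sanity-check sketch of my own; the exact file name inside the zip is my assumption based on the archive name:

import zipfile

# List what is bundled inside the EfficientSAM-S checkpoint archive.
with zipfile.ZipFile("weights/efficient_sam_vits.pt.zip") as zf:
    print(zf.namelist())  # expected to show something like 'efficient_sam_vits.pt'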
🖼️ 2. Basic Image Segmentation - CPU & GPU
First, if you run the provided EfficientSAM_example.py in the repo, you can check that it works correctly.
from efficient_sam.build_efficient_sam import build_efficient_sam_vitt, build_efficient_sam_vits
# from squeeze_sam.build_squeeze_sam import build_squeeze_sam
from PIL import Image
from torchvision import transforms
import torch
import numpy as np
import zipfile

models = {}

# Build the EfficientSAM-Ti model.
models['efficientsam_ti'] = build_efficient_sam_vitt()

# Since EfficientSAM-S checkpoint file is >100MB, we store the zip file.
with zipfile.ZipFile("weights/efficient_sam_vits.pt.zip", 'r') as zip_ref:
    zip_ref.extractall("weights")
# Build the EfficientSAM-S model.
models['efficientsam_s'] = build_efficient_sam_vits()

# Build the SqueezeSAM model.
# models['squeeze_sam'] = build_squeeze_sam()

# Load an image.
sample_image_np = np.array(Image.open("figs/examples/dogs.jpg"))
sample_image_tensor = transforms.ToTensor()(sample_image_np)
# Feed a few (x,y) points in the mask as input.
input_points = torch.tensor([[[[580, 350], [650, 350]]]])
input_labels = torch.tensor([[[1, 1]]])

# Run inference for both EfficientSAM-Ti and EfficientSAM-S models.
for model_name, model in models.items():
    print('Running inference using ', model_name)
    predicted_logits, predicted_iou = model(
        sample_image_tensor[None, ...],
        input_points,
        input_labels,
    )
    sorted_ids = torch.argsort(predicted_iou, dim=-1, descending=True)
    predicted_iou = torch.take_along_dim(predicted_iou, sorted_ids, dim=2)
    predicted_logits = torch.take_along_dim(
        predicted_logits, sorted_ids[..., None, None], dim=2
    )
    # The masks are already sorted by their predicted IoUs.
    # The first dimension is the batch size (we have a single image, so it is 1).
    # The second dimension is the number of masks we want to generate (in this case, only 1).
    # The third dimension is the number of candidate masks output by the model.
    # For this demo we use the first mask.
    mask = torch.ge(predicted_logits[0, 0, 0, :, :], 0).cpu().detach().numpy()
    masked_image_np = sample_image_np.copy().astype(np.uint8) * mask[:, :, None]
    Image.fromarray(masked_image_np).save(f"figs/examples/dogs_{model_name}_mask.png")

# Check: parameter device
print("Model param device:", next(models['efficientsam_ti'].parameters()).device)
print("Image tensor device:", sample_image_tensor.device)
You will see that the dog is successfully segmented:
But the model runs on the CPU by default.
If you modify the script as follows, you can confirm from the logs that it runs on the GPU:
from efficient_sam.build_efficient_sam import build_efficient_sam_vitt, build_efficient_sam_vits
# from squeeze_sam.build_squeeze_sam import build_squeeze_sam
from PIL import Image
from torchvision import transforms
import torch
import numpy as np
import zipfile
import contextlib

# -----------------------------------
# Device setting
# -----------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# -----------------------------------
# Load models (+ move to GPU)
# -----------------------------------
models = {}

# EfficientSAM-Ti
models['efficientsam_ti'] = build_efficient_sam_vitt().to(device).eval()

# EfficientSAM-S (unzip weights first)
with zipfile.ZipFile("weights/efficient_sam_vits.pt.zip", 'r') as zip_ref:
    zip_ref.extractall("weights")
models['efficientsam_s'] = build_efficient_sam_vits().to(device).eval()

# SqueezeSAM (optional)
# models['squeeze_sam'] = build_squeeze_sam().to(device).eval()

# -----------------------------------
# Prepare input (+ move to GPU)
# -----------------------------------
sample_image_np = np.array(Image.open("figs/examples/dogs.jpg"))
sample_image_tensor = transforms.ToTensor()(sample_image_np).to(device)  # [C,H,W] float32 on device

# Feed a few (x,y) points in the mask as input.
input_points = torch.tensor([[[[580, 350], [650, 350]]]], device=device)  # [B=1, N=1, K=2, 2]
input_labels = torch.tensor([[[1, 1]]], device=device)                    # [B=1, N=1, K=2]

# -----------------------------------
# Inference (AMP for CUDA only)
# -----------------------------------
amp_ctx = torch.autocast(device_type="cuda") if device.type == "cuda" else contextlib.nullcontext()

for model_name, model in models.items():
    print('Running inference using', model_name)
    with torch.inference_mode(), amp_ctx:
        predicted_logits, predicted_iou = model(
            sample_image_tensor[None, ...],  # [1,C,H,W]
            input_points,
            input_labels,
        )
    # Sort and select the top mask
    sorted_ids = torch.argsort(predicted_iou, dim=-1, descending=True)
    predicted_iou = torch.take_along_dim(predicted_iou, sorted_ids, dim=2)
    predicted_logits = torch.take_along_dim(predicted_logits, sorted_ids[..., None, None], dim=2)

    # Use the first candidate mask
    mask = (predicted_logits[0, 0, 0] >= 0).to(torch.uint8).cpu().numpy()  # [H,W], uint8 (0/1)

    # Save the masked image
    masked_image_np = (sample_image_np.astype(np.uint8) * mask[:, :, None])
    Image.fromarray(masked_image_np).save(f"figs/examples/dogs_{model_name}_mask.png")

# Check: parameter device
print("Model param device:", next(models['efficientsam_ti'].parameters()).device)
print("Image tensor device:", sample_image_tensor.device)
Result log:
Running inference using efficientsam_ti
Running inference using efficientsam_s
Model param device: cuda:0
Image tensor device: cuda:0
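If you want to roughly check how much faster GPU inference is, here is a small timing sketch of my own (the time_inference helper is not part of the repo; it reuses models, sample_image_tensor, input_points, input_labels, and device from the script above):

import time

def time_inference(model, n_warmup=3, n_runs=10):
    # Warm up first so one-off setup costs do not skew the numbers.
    for _ in range(n_warmup):
        with torch.inference_mode():
            model(sample_image_tensor[None, ...], input_points, input_labels)
    if device.type == "cuda":
        torch.cuda.synchronize()  # CUDA kernels run asynchronously, so sync before timing
    start = time.perf_counter()
    for _ in range(n_runs):
        with torch.inference_mode():
            model(sample_image_tensor[None, ...], input_points, input_labels)
    if device.type == "cuda":
        torch.cuda.synchronize()
    return (time.perf_counter() - start) / n_runs

for name, m in models.items():
    print(f"{name}: {time_inference(m) * 1000:.1f} ms per image")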
🧪 3. Experimenting with Prompts
EfficientSAM performs segmentation based on either box or point prompts.
The most basic point prompt is already shown in the sample code above:
...
# Feed a few (x,y) points in the mask as input.
input_points = torch.tensor([[[[580, 350], [650, 350]]]], device=device) # [B=1, N=1, K=2, 2]
input_labels = torch.tensor([[[1, 1]]], device=device) # [B=1, N=1, K=2]
...
This means two positive points were provided:
- [580, 350], [650, 350] → user-clicked coordinates.
- 1, 1 → positive points (included in the mask).
If the label had been 0, it would mean a negative point (outside the target area).
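For example, a minimal sketch mixing one positive and one negative point might look like this (the background coordinate is made up for illustration; the model, image tensor, and device come from the GPU script above):

# One point on the dog (label 1 = positive) and one on the background (label 0 = negative).
# The background coordinate [100, 100] is hypothetical; pick any point outside the target.
input_points = torch.tensor([[[[580, 350], [100, 100]]]], device=device)  # [B=1, N=1, K=2, 2]
input_labels = torch.tensor([[[1, 0]]], device=device)                    # [B=1, N=1, K=2]

with torch.inference_mode():
    predicted_logits, predicted_iou = models['efficientsam_ti'](
        sample_image_tensor[None, ...], input_points, input_labels
    )
mask = (predicted_logits[0, 0, 0] >= 0).cpu().numpy()  # first candidate mask, [H,W] bool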
So, what about bbox?
...
box_pts = np.array([[x1, y1], [x2, y2]], dtype=np.int64)
box_lbl = np.array([2, 3], dtype=np.int64)
input_points = torch.from_numpy(box_pts)[None, None, ...].to(device) # [1,1,2,2]
input_labels = torch.from_numpy(box_lbl)[None, None, ...].to(device) # [1,1,2]
...
With this format (a short end-to-end sketch follows after this list):
- Give two points and set labels to [2, 3], and it will be interpreted as a bounding box.
- Label 2 = top-left corner, Label 3 = bottom-right corner.
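Putting it together, here is a minimal box-prompt sketch (the pixel coordinates are made up for illustration; models, sample_image_tensor, and device come from the GPU script in section 2):

# Hypothetical box in pixel coordinates: (x1, y1) = top-left, (x2, y2) = bottom-right.
x1, y1, x2, y2 = 400, 200, 800, 600

box_pts = np.array([[x1, y1], [x2, y2]], dtype=np.int64)
box_lbl = np.array([2, 3], dtype=np.int64)  # 2 = top-left corner, 3 = bottom-right corner
input_points = torch.from_numpy(box_pts)[None, None, ...].to(device)  # [1,1,2,2]
input_labels = torch.from_numpy(box_lbl)[None, None, ...].to(device)  # [1,1,2]

with torch.inference_mode():
    logits, iou = models['efficientsam_ti'](sample_image_tensor[None, ...], input_points, input_labels)

# Sort candidates by predicted IoU and take the best one, as in the sample script.
order = torch.argsort(iou, dim=-1, descending=True)
logits = torch.take_along_dim(logits, order[..., None, None], dim=2)
mask = (logits[0, 0, 0] >= 0).cpu().numpy()  # [H,W] bool mask for the boxed region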
⚙️ 4. Real-World Application - GroundingDINO + EfficientSAM
- Remember the open-vocabulary object detection tool GroundingDINO? Make sure you have GroundingDINO set up (see the previous post).
- We will use GroundingDINO to generate bboxes, and then use EfficientSAM for segmentation.
# 1. Package imports, model loading & variable setup
import os, contextlib, zipfile, numpy as np, torch, cv2
from PIL import Image
from torchvision import transforms
from efficient_sam.build_efficient_sam import build_efficient_sam_vitt, build_efficient_sam_vits
from groundingdino.util.inference import load_model, load_image, predict
IMAGE_PATH = "{my_dir}/EfficientSAM_gdino/figs/examples/brush_with_toothbrush_000268.jpg"
TEXT_PROMPT = "toothbrush"
SAVE_PATH = "{my_dir}/EfficientSAM_gdino/figs/examples"
img_name = os.path.splitext(os.path.basename(IMAGE_PATH))[0]
GDINO_CONFIG = "{my_dir}/EfficientSAM_gdino/grounding_dino/config/GroundingDINO_SwinT_OGC.py"
GDINO_WEIGHTS = "{my_dir}/EfficientSAM_gdino/grounding_dino/weights/groundingdino_swint_ogc.pth"
EFSAM_S_ZIP = "{my_dir}/EfficientSAM_gdino/weights/efficient_sam_vits.pt.zip"
OUTPUT_DIR = os.path.join(SAVE_PATH, "outputs")
os.makedirs(OUTPUT_DIR, exist_ok=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("[Device]", device)
# ---------- GroundingDINO ----------
gdino = load_model(GDINO_CONFIG, GDINO_WEIGHTS)
image_source, image_pre = load_image(IMAGE_PATH)
# ---------- EfficientSAM ----------
models = {}
models["efficientsam_ti"] = build_efficient_sam_vitt().to(device).eval()
if os.path.isfile(EFSAM_S_ZIP):
    with zipfile.ZipFile(EFSAM_S_ZIP, "r") as zf:
        zf.extractall("weights")
    models["efficientsam_s"] = build_efficient_sam_vits().to(device).eval()
# ---------- Image tensor ----------
img_np = np.array(Image.open(IMAGE_PATH).convert("RGB"))
H, W = img_np.shape[:2]
img_tensor = transforms.ToTensor()(img_np).to(device) # [C,H,W]
# 2. Get a bbox from GroundingDINO
boxes_norm, logits, phrases = predict(
    model=gdino, image=image_pre, caption=TEXT_PROMPT,
    box_threshold=0.35, text_threshold=0.25
)
if len(boxes_norm) == 0:
    raise SystemExit("[GroundingDINO] no box")
top = int(torch.argmax(logits).item())

# boxes_norm[top] = (cx, cy, w, h) in [0,1]
cxcywh = boxes_norm[top].detach().cpu()  # -> CPU tensor

# Build x1y1x2y2 with tensor ops, then round + cast to int
x1, y1, x2, y2 = torch.tensor([
    (cxcywh[0] - cxcywh[2] / 2) * W,
    (cxcywh[1] - cxcywh[3] / 2) * H,
    (cxcywh[0] + cxcywh[2] / 2) * W,
    (cxcywh[1] + cxcywh[3] / 2) * H,
]).round().to(torch.int64).tolist()

# Tidy / clamp coordinates
x1, x2 = max(0, min(x1, x2)), min(W, max(x1, x2))
y1, y2 = max(0, min(y1, y2)), min(H, max(y1, y2))
print(f"[Box pixel] {x1},{y1} -> {x2},{y2}")

# 3. Feed the bbox prompt into EfficientSAM
# ---------- Convert to a 'box prompt' here ----------
# SAM/ESAM convention: give two points with labels [2, 3] and they are interpreted as a box.
# label 2 = top-left corner, label 3 = bottom-right corner
box_pts = np.array([[x1, y1], [x2, y2]], dtype=np.int64)
box_lbl = np.array([2, 3], dtype=np.int64)
input_points = torch.from_numpy(box_pts)[None, None, ...].to(device)  # [1,1,2,2]
input_labels = torch.from_numpy(box_lbl)[None, None, ...].to(device)  # [1,1,2]

# ---------- Optional: visualize the bbox / prompt points ----------
dbg = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR).copy()
cv2.rectangle(dbg, (x1, y1), (x2, y2), (0, 255, 0), 2)
cv2.circle(dbg, (x1, y1), 4, (255, 0, 0), -1)  # label=2
cv2.circle(dbg, (x2, y2), 4, (0, 0, 255), -1)  # label=3
cv2.imwrite(os.path.join(OUTPUT_DIR, f"{img_name}_bbox_prompt.jpg"), dbg)

# 4. Run segmentation and visualize the results
def clip_mask_to_bbox(mask, x1, y1, x2, y2):
    out = np.zeros_like(mask, dtype=mask.dtype)
    out[y1:y2, x1:x2] = mask[y1:y2, x1:x2]
    return out

def keep_largest_component(mask_uint8):
    num, labels = cv2.connectedComponents(mask_uint8.astype(np.uint8))
    if num <= 2:
        return mask_uint8
    areas = [(labels == i).sum() for i in range(1, num)]
    i_best = int(np.argmax(areas)) + 1
    return (labels == i_best).astype(np.uint8)

amp_ctx = torch.amp.autocast("cuda") if device.type == "cuda" else contextlib.nullcontext()

for name, esam in models.items():
    with torch.inference_mode(), amp_ctx:
        logits, iou = esam(img_tensor[None, ...], input_points, input_labels)  # box prompt
    order = torch.argsort(iou, dim=-1, descending=True)
    logits = torch.take_along_dim(logits, order[..., None, None], dim=2)
    cand = (logits[0, 0] >= 0).to(torch.uint8).cpu().numpy()  # [K,H,W]

    # Among the candidates, keep the largest component inside the bbox (suppresses noise)
    best = cand[0]
    best = clip_mask_to_bbox(best, x1, y1, x2, y2)
    best = keep_largest_component(best)

    # Save the masked image and an overlay
    masked = (img_np.astype(np.uint8) * best[:, :, None])
    Image.fromarray(masked).save(os.path.join(OUTPUT_DIR, f"{img_name}_{name}_mask.png"))
    overlay = img_np.copy()
    overlay[best == 1] = (0.6 * overlay[best == 1] + 0.4 * np.array([0, 255, 0], np.uint8)).astype(np.uint8)
    cv2.imwrite(os.path.join(OUTPUT_DIR, f"{img_name}_{name}_overlay.jpg"),
                cv2.cvtColor(overlay, cv2.COLOR_RGB2BGR))
    print("done.", os.path.join(OUTPUT_DIR, f"{img_name}_{name}_overlay.jpg"))
Through this process, you can clearly see the bbox and segmentation results!
With multiple models connected, you can carry out many exciting projects~!