
🧠 EfficientSAM Hands-On Practice!! with Python

🧬 (English) EfficientSAM Practice!!

Today, following the previous post where we studied the theory,
we will conduct a hands-on experiment with the lightweight segmentation model EfficientSAM!

โœ”๏ธ Main experiment objectives:

  • Use EfficientSAM with GPU
  • Prompt-based segmentation
  • Prompts can be given either as Box or Point

🔧 1. Installation & Setup

Unlike Hugging Face Transformers, EfficientSAM is not distributed as a pip package,
so you need to clone the GitHub repository directly.

# 1. Clone the EfficientSAM repository
git clone https://github.com/ChaoningZhang/EfficientSAM.git
cd EfficientSAM
  • Conveniently, the weights folder already contains the pretrained model efficient_sam_vits.pt.zip.
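
The scripts below also assume a working Python environment with PyTorch. As a rough sketch (the package list here is my assumption, not something pinned by this post; follow the repo's README if it differs), something like the following is usually enough:

# Install the typical dependencies for the scripts in this post (versions not pinned)
pip install torch torchvision pillow numpy
# Optional: OpenCV is used later in the GroundingDINO + EfficientSAM pipeline
pip install opencv-python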

๐Ÿ–ผ๏ธ 2. Basic Image Segmentation โ€“ CPU & GPU

First, if you run the provided EfficientSAM_example.py in the repo,
you can check that it works correctly.

from efficient_sam.build_efficient_sam import build_efficient_sam_vitt, build_efficient_sam_vits
# from squeeze_sam.build_squeeze_sam import build_squeeze_sam

from PIL import Image
from torchvision import transforms
import torch
import numpy as np
import zipfile
    


models = {}

# Build the EfficientSAM-Ti model.
models['efficientsam_ti'] = build_efficient_sam_vitt()

# Since EfficientSAM-S checkpoint file is >100MB, we store the zip file.
with zipfile.ZipFile("weights/efficient_sam_vits.pt.zip", 'r') as zip_ref:
    zip_ref.extractall("weights")
# Build the EfficientSAM-S model.
models['efficientsam_s'] = build_efficient_sam_vits()

# Build the SqueezeSAM model.
# models['squeeze_sam'] = build_squeeze_sam()

# load an image
sample_image_np = np.array(Image.open("figs/examples/dogs.jpg"))
sample_image_tensor = transforms.ToTensor()(sample_image_np)
# Feed a few (x,y) points in the mask as input.

input_points = torch.tensor([[[[580, 350], [650, 350]]]])
input_labels = torch.tensor([[[1, 1]]])

# Run inference for both EfficientSAM-Ti and EfficientSAM-S models.
for model_name, model in models.items():
    print('Running inference using ', model_name)
    predicted_logits, predicted_iou = model(
        sample_image_tensor[None, ...],
        input_points,
        input_labels,
    )
    sorted_ids = torch.argsort(predicted_iou, dim=-1, descending=True)
    predicted_iou = torch.take_along_dim(predicted_iou, sorted_ids, dim=2)
    predicted_logits = torch.take_along_dim(
        predicted_logits, sorted_ids[..., None, None], dim=2
    )
    # The masks are already sorted by their predicted IOUs.
    # The first dimension is the batch size (we have a single image. so it is 1).
    # The second dimension is the number of masks we want to generate (in this case, it is only 1)
    # The third dimension is the number of candidate masks output by the model.
    # For this demo we use the first mask.
    mask = torch.ge(predicted_logits[0, 0, 0, :, :], 0).cpu().detach().numpy()
    masked_image_np = sample_image_np.copy().astype(np.uint8) * mask[:,:,None]
    Image.fromarray(masked_image_np).save(f"figs/examples/dogs_{model_name}_mask.png")

# Check: parameter device
print("Model param device:", next(models['efficientsam_ti'].parameters()).device)
print("Image tensor device:", sample_image_tensor.device)

You will see that the dog is successfully segmented:

[Image: segmentation result on dogs.jpg]

But by default everything runs on the CPU.
If you modify the script as follows, the logs will confirm that it runs on the GPU:

from efficient_sam.build_efficient_sam import build_efficient_sam_vitt, build_efficient_sam_vits
# from squeeze_sam.build_squeeze_sam import build_squeeze_sam

from PIL import Image
from torchvision import transforms
import torch
import numpy as np
import zipfile
import contextlib

# -----------------------------------
# Device setting
# -----------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# -----------------------------------
# Load models (+ move to GPU)
# -----------------------------------
models = {}

# EfficientSAM-Ti
models['efficientsam_ti'] = build_efficient_sam_vitt().to(device).eval()

# EfficientSAM-S (unzip weights first)
with zipfile.ZipFile("weights/efficient_sam_vits.pt.zip", 'r') as zip_ref:
    zip_ref.extractall("weights")
models['efficientsam_s'] = build_efficient_sam_vits().to(device).eval()

# SqueezeSAM (optional)
# models['squeeze_sam'] = build_squeeze_sam().to(device).eval()

# -----------------------------------
# Prepare input (+ move to GPU)
# -----------------------------------
sample_image_np = np.array(Image.open("figs/examples/dogs.jpg"))
sample_image_tensor = transforms.ToTensor()(sample_image_np).to(device)  # [C,H,W] float32 on device

# Feed a few (x,y) points in the mask as input.
input_points = torch.tensor([[[[580, 350], [650, 350]]]], device=device)  # [B=1, N=1, K=2, 2]
input_labels = torch.tensor([[[1, 1]]], device=device)                    # [B=1, N=1, K=2]

# -----------------------------------
# Inference (AMP for CUDA only)
# -----------------------------------
amp_ctx = torch.autocast(device_type="cuda") if device.type == "cuda" else contextlib.nullcontext()

for model_name, model in models.items():
    print('Running inference using', model_name)

    with torch.inference_mode(), amp_ctx:
        predicted_logits, predicted_iou = model(
            sample_image_tensor[None, ...],  # [1,C,H,W]
            input_points,
            input_labels,
        )

    # Sort and select top mask
    sorted_ids = torch.argsort(predicted_iou, dim=-1, descending=True)
    predicted_iou = torch.take_along_dim(predicted_iou, sorted_ids, dim=2)
    predicted_logits = torch.take_along_dim(predicted_logits, sorted_ids[..., None, None], dim=2)

    # Use the first candidate mask
    mask = (predicted_logits[0, 0, 0] >= 0).to(torch.uint8).cpu().numpy()  # [H,W], uint8 (0/1)

    # Save masked image
    masked_image_np = (sample_image_np.astype(np.uint8) * mask[:, :, None])
    Image.fromarray(masked_image_np).save(f"figs/examples/dogs_{model_name}_mask.png")

# Check: parameter device
print("Model param device:", next(models['efficientsam_ti'].parameters()).device)
print("Image tensor device:", sample_image_tensor.device)

Result log:

Running inference using efficientsam_ti
Running inference using efficientsam_s
Model param device: cuda:0
Image tensor device: cuda:0
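
For a quick (and unscientific) check that the GPU path actually helps, you can time a single forward pass. This is a hedged sketch reusing model, amp_ctx, and the input tensors from the script above; the numbers will vary with your hardware:

import time

# Rough timing of one forward pass (not a rigorous benchmark).
if device.type == "cuda":
    torch.cuda.synchronize()  # finish pending GPU work before starting the clock
start = time.perf_counter()
with torch.inference_mode(), amp_ctx:
    model(sample_image_tensor[None, ...], input_points, input_labels)
if device.type == "cuda":
    torch.cuda.synchronize()  # wait for the GPU to finish before reading the clock
print(f"One forward pass took {(time.perf_counter() - start) * 1000:.1f} ms")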

🧪 3. Experimenting with Prompts

EfficientSAM performs segmentation based on either box or point prompts.
The most basic point prompt is already shown in the sample code above:

...
# Feed a few (x,y) points in the mask as input.
input_points = torch.tensor([[[[580, 350], [650, 350]]]], device=device)  # [B=1, N=1, K=2, 2]
input_labels = torch.tensor([[[1, 1]]], device=device)                    # [B=1, N=1, K=2]
...

This means two positive points were provided:

  • [580, 350], [650, 350] → user-clicked coordinates.
  • 1, 1 → positive points (included in the mask).

If the label had been 0, it would mean a negative point (outside the target area).
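
For example, a hedged sketch mixing one positive click with one negative click could look like this (the negative coordinate is made up purely for illustration; pick a background pixel from your own image):

# One positive point on the target, one negative point on the background (illustrative coordinates).
input_points = torch.tensor([[[[580, 350], [100, 100]]]], device=device)  # [B=1, N=1, K=2, 2]
input_labels = torch.tensor([[[1, 0]]], device=device)                    # 1 = positive, 0 = negative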

So, what about bbox?

...
box_pts = np.array([[x1, y1], [x2, y2]], dtype=np.int64)
box_lbl = np.array([2, 3], dtype=np.int64)
input_points = torch.from_numpy(box_pts)[None, None, ...].to(device)   # [1,1,2,2]
input_labels = torch.from_numpy(box_lbl)[None, None, ...].to(device)   # [1,1,2]
...

With this format:

  • Give two points and set labels to [2, 3], and it will be interpreted as a bounding box.
  • Label 2 = top-left corner, Label 3 = bottom-right corner.
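
Plugging such a box prompt into the same forward pass from section 2 is straightforward. A minimal sketch, assuming the models dict and sample_image_tensor from above and an illustrative box (x1, y1, x2, y2):

# Segment with a box prompt instead of point prompts (box coordinates are illustrative).
x1, y1, x2, y2 = 400, 200, 800, 600

box_pts = np.array([[x1, y1], [x2, y2]], dtype=np.int64)
box_lbl = np.array([2, 3], dtype=np.int64)                                # 2 = top-left, 3 = bottom-right
input_points = torch.from_numpy(box_pts)[None, None, ...].to(device)      # [1,1,2,2]
input_labels = torch.from_numpy(box_lbl)[None, None, ...].to(device)      # [1,1,2]

with torch.inference_mode():
    predicted_logits, predicted_iou = models['efficientsam_ti'](
        sample_image_tensor[None, ...], input_points, input_labels
    )
mask = (predicted_logits[0, 0, 0] >= 0).to(torch.uint8).cpu().numpy()     # first candidate mask, [H,W]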

โš™๏ธ 4. Real-World Application โ€“ GroundingDINO + EfficientSAM

  • Remember the open-vocabulary object detection tool GroundingDINO?
  • Make sure you have GroundingDINO set up (see previous post).
  • We will use GroundingDINO to generate bboxes,
  • and then use EfficientSAM for segmentation.
# 1. Package imports, model loading & variable setup  
import os, contextlib, zipfile, numpy as np, torch, cv2
from PIL import Image
from torchvision import transforms
from efficient_sam.build_efficient_sam import build_efficient_sam_vitt, build_efficient_sam_vits
from groundingdino.util.inference import load_model, load_image, predict

IMAGE_PATH = "{my_dir}/EfficientSAM_gdino/figs/examples/brush_with_toothbrush_000268.jpg"
TEXT_PROMPT = "toothbrush"

SAVE_PATH = "{my_dir}/EfficientSAM_gdino/figs/examples"
img_name = os.path.splitext(os.path.basename(IMAGE_PATH))[0]

GDINO_CONFIG  = "{my_dir}/EfficientSAM_gdino/grounding_dino/config/GroundingDINO_SwinT_OGC.py"
GDINO_WEIGHTS = "{my_dir}/EfficientSAM_gdino/grounding_dino/weights/groundingdino_swint_ogc.pth"
EFSAM_S_ZIP   = "{my_dir}/EfficientSAM_gdino/weights/efficient_sam_vits.pt.zip"

OUTPUT_DIR = os.path.join(SAVE_PATH, "outputs")
os.makedirs(OUTPUT_DIR, exist_ok=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("[Device]", device)

# ---------- GroundingDINO ----------
gdino = load_model(GDINO_CONFIG, GDINO_WEIGHTS)
image_source, image_pre = load_image(IMAGE_PATH)

# ---------- EfficientSAM ----------
models = {}
models["efficientsam_ti"] = build_efficient_sam_vitt().to(device).eval()
if os.path.isfile(EFSAM_S_ZIP):
    with zipfile.ZipFile(EFSAM_S_ZIP, "r") as zf: zf.extractall("weights")
models["efficientsam_s"]  = build_efficient_sam_vits().to(device).eval()

# ---------- Image tensor ----------
img_np = np.array(Image.open(IMAGE_PATH).convert("RGB"))
H, W = img_np.shape[:2]
img_tensor = transforms.ToTensor()(img_np).to(device)  # [C,H,W]

(... intermediate code omitted here; the full pipeline listing appears in the Korean-language walkthrough below ...)


[Image: bbox prompt and segmentation results]

Through this process, you can clearly see the bbox and segmentation results!

With multiple models connected, you can carry out many exciting projects~ 🎉


🧬 (Korean) EfficientSAM Practice!!

์˜ค๋Š˜์€ ์ง€๋‚œํฌ์ŠคํŒ…์—์„œ ์ด๋ก ์— ๋Œ€ํ•˜์—ฌ ๊ณต๋ถ€ํ•ด๋ณด์•˜๋˜!
๊ฐ€๋ฒผ์šด Segmentation ๋ชจ๋ธ๋ธ! EfficientSAM ์˜ ์‹ค์Šต์„ ์ง„ํ–‰ํ•ด๋ณด๊ฒ ์Šต๋‹ˆ๋‹ค!!

✔️ Basic experiment objectives:

  • Use EfficientSAM with a GPU
  • Prompt-based segmentation
  • Prompts can be given to the image as a box or points

🔧 1. Installation & Setup

EfficientSAM์€ Hugging Face Transformers์ฒ˜๋Ÿผ pip ํŒจํ‚ค์ง€๊ฐ€ ์•„๋‹ˆ๋ฏ€๋กœ,
GitHub์—์„œ ์ง์ ‘ ํด๋ก ํ•˜์—ฌ ์„ค์น˜ํ•ฉ๋‹ˆ๋‹ค.

# 1. Clone the EfficientSAM repository
git clone https://github.com/ChaoningZhang/EfficientSAM.git
cd EfficientSAM
  • Conveniently, the weights folder already contains the pretrained model efficient_sam_vits.pt.zip~

๐Ÿ–ผ๏ธ 2. ์ด๋ฏธ์ง€ ์„ธ๊ทธ๋ฉ˜ํ…Œ์ด์…˜ ๊ธฐ๋ณธ ์‹ค์Šต - CPU&GPU!

First, if you run the EfficientSAM_example.py stored in the repo, it runs fine!

from efficient_sam.build_efficient_sam import build_efficient_sam_vitt, build_efficient_sam_vits
# from squeeze_sam.build_squeeze_sam import build_squeeze_sam

from PIL import Image
from torchvision import transforms
import torch
import numpy as np
import zipfile
    


models = {}

# Build the EfficientSAM-Ti model.
models['efficientsam_ti'] = build_efficient_sam_vitt()

# Since EfficientSAM-S checkpoint file is >100MB, we store the zip file.
with zipfile.ZipFile("weights/efficient_sam_vits.pt.zip", 'r') as zip_ref:
    zip_ref.extractall("weights")
# Build the EfficientSAM-S model.
models['efficientsam_s'] = build_efficient_sam_vits()

# Build the SqueezeSAM model.
# models['squeeze_sam'] = build_squeeze_sam()

# load an image
sample_image_np = np.array(Image.open("figs/examples/dogs.jpg"))
sample_image_tensor = transforms.ToTensor()(sample_image_np)
# Feed a few (x,y) points in the mask as input.

input_points = torch.tensor([[[[580, 350], [650, 350]]]])
input_labels = torch.tensor([[[1, 1]]])

# Run inference for both EfficientSAM-Ti and EfficientSAM-S models.
for model_name, model in models.items():
    print('Running inference using ', model_name)
    predicted_logits, predicted_iou = model(
        sample_image_tensor[None, ...],
        input_points,
        input_labels,
    )
    sorted_ids = torch.argsort(predicted_iou, dim=-1, descending=True)
    predicted_iou = torch.take_along_dim(predicted_iou, sorted_ids, dim=2)
    predicted_logits = torch.take_along_dim(
        predicted_logits, sorted_ids[..., None, None], dim=2
    )
    # The masks are already sorted by their predicted IOUs.
    # The first dimension is the batch size (we have a single image. so it is 1).
    # The second dimension is the number of masks we want to generate (in this case, it is only 1)
    # The third dimension is the number of candidate masks output by the model.
    # For this demo we use the first mask.
    mask = torch.ge(predicted_logits[0, 0, 0, :, :], 0).cpu().detach().numpy()
    masked_image_np = sample_image_np.copy().astype(np.uint8) * mask[:,:,None]
    Image.fromarray(masked_image_np).save(f"figs/examples/dogs_{model_name}_mask.png")

# ํ™•์ธ์šฉ: ํŒŒ๋ผ๋ฏธํ„ฐ ์žฅ์น˜
print("Model param device:", next(models['efficientsam_ti'].parameters()).device)
print("Image tensor device:", sample_image_tensor.device)

You can see below that the dog has been segmented!!

[Image: segmentation result on dogs.jpg]

๊ทธ๋Ÿฐ๋ฐ!! Device๊ฐ€ CPU์ด๊ธฐ์—~~
์•„๋ž˜์™€ ๊ฐ™์ด ํ•ด๋ณด๋ฉด~~ GPU ๋กœ ๋Œ์•„๊ฐ„ ๋กœ๊ทธ ํ™•์ธ์ด ๊ฐ€๋Šฅํ•ฉ๋‹ˆ๋‹ค!!

from efficient_sam.build_efficient_sam import build_efficient_sam_vitt, build_efficient_sam_vits
# from squeeze_sam.build_squeeze_sam import build_squeeze_sam

from PIL import Image
from torchvision import transforms
import torch
import numpy as np
import zipfile
import contextlib

# -----------------------------------
# Device setting
# -----------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# -----------------------------------
# ๋ชจ๋ธ ๋กœ๋“œ (+ GPU ์ด๋™)
# -----------------------------------
models = {}

# EfficientSAM-Ti
models['efficientsam_ti'] = build_efficient_sam_vitt().to(device).eval()

# EfficientSAM-S (unzip weights first)
with zipfile.ZipFile("weights/efficient_sam_vits.pt.zip", 'r') as zip_ref:
    zip_ref.extractall("weights")
models['efficientsam_s'] = build_efficient_sam_vits().to(device).eval()

# SqueezeSAM (optional)
# models['squeeze_sam'] = build_squeeze_sam().to(device).eval()

# -----------------------------------
# Prepare input (+ move to GPU)
# -----------------------------------
# load an image
sample_image_np = np.array(Image.open("figs/examples/dogs.jpg"))
sample_image_tensor = transforms.ToTensor()(sample_image_np).to(device)  # [C,H,W] float32 on device

# Feed a few (x,y) points in the mask as input.
input_points = torch.tensor([[[[580, 350], [650, 350]]]], device=device)  # [B=1, N=1, K=2, 2]
input_labels = torch.tensor([[[1, 1]]], device=device)                    # [B=1, N=1, K=2]

# -----------------------------------
# Inference (AMP for CUDA only)
# -----------------------------------
amp_ctx = torch.autocast(device_type="cuda") if device.type == "cuda" else contextlib.nullcontext()

for model_name, model in models.items():
    print('Running inference using', model_name)

    with torch.inference_mode(), amp_ctx:
        predicted_logits, predicted_iou = model(
            sample_image_tensor[None, ...],  # [1,C,H,W]
            input_points,
            input_labels,
        )

    # Sort and select top mask
    sorted_ids = torch.argsort(predicted_iou, dim=-1, descending=True)
    predicted_iou = torch.take_along_dim(predicted_iou, sorted_ids, dim=2)
    predicted_logits = torch.take_along_dim(predicted_logits, sorted_ids[..., None, None], dim=2)

    # Use the first candidate mask
    mask = (predicted_logits[0, 0, 0] >= 0).to(torch.uint8).cpu().numpy()  # [H,W], uint8 (0/1)

    # Save masked image
    masked_image_np = (sample_image_np.astype(np.uint8) * mask[:, :, None])
    Image.fromarray(masked_image_np).save(f"figs/examples/dogs_{model_name}_mask.png")

# ํ™•์ธ์šฉ: ํŒŒ๋ผ๋ฏธํ„ฐ ์žฅ์น˜
print("Model param device:", next(models['efficientsam_ti'].parameters()).device)
print("Image tensor device:", sample_image_tensor.device)

Result log~!~

Running inference using efficientsam_ti
Running inference using efficientsam_s
Model param device: cuda:0
Image tensor device: cuda:0

🧪 3. Experimenting with Different Prompts!!

EfficientSAM์€ box ๋˜๋Š” point prompt ๊ธฐ๋ฐ˜์œผ๋กœ ์„ธ๊ทธ๋ฉ˜ํ…Œ์ด์…˜์„ ์ˆ˜ํ–‰ํ•ฉ๋‹ˆ๋‹ค.
๋จผ์ € ๊ฐ€์žฅ ๊ธฐ๋ณธ์ ์ธ point prompt ๋Š” ์‚ฌ์‹ค!! ์œ„์˜ ์ƒ˜ํ”Œ์ฝ”๋“œ์—์„œ ์•Œ์ˆ˜ ์žˆ์”๋‹ˆ๋‹ค!!

It is exactly the part below~~

...
# Feed a few (x,y) points in the mask as input.
input_points = torch.tensor([[[[580, 350], [650, 350]]]], device=device)  # [B=1, N=1, K=2, 2]
input_labels = torch.tensor([[[1, 1]]], device=device)                    # [B=1, N=1, K=2]
...

Interpreting the above, it was run with two positive points:

  • [580, 350], [650, 350] → the coordinates the user clicked.
  • 1, 1 → positive points (include this region).

If a point's label had been 0, it would mean a negative point, i.e., outside the target region!!

Then!! what about a bbox!!?

...
box_pts = np.array([[x1, y1], [x2, y2]], dtype=np.int64)
box_lbl = np.array([2, 3], dtype=np.int64)
input_points = torch.from_numpy(box_pts)[None, None, ...].to(device)   # [1,1,2,2]
input_labels = torch.from_numpy(box_lbl)[None, None, ...].to(device)   # [1,1,2]
...

์œ„์˜ ์ฝ”๋“œ ํ˜•์‹์œผ๋กœ

  • Give two points and set the labels to [2, 3], and it is interpreted as a box.
  • Here, label 2 = the top-left point and label 3 = the bottom-right point!!

โš™๏ธ 4. ์‹ค์ „ ์‘์šฉ~!!! groundingDINO + EfficientSAM ์‹คํ—˜ํ•ด๋ณด๊ธฐ

  • Remember GroundingDINO, the open-vocabulary object detection tool!?
  • GroundingDINO needs to be set up first, so if you are curious, please check the previous post!!
  • We will use GroundingDINO to generate a bbox,
  • and then run segmentation with EfficientSAM!!
# 1. Package imports, model loading & variable setup
import os, contextlib, zipfile, numpy as np, torch, cv2
from PIL import Image
from torchvision import transforms
from efficient_sam.build_efficient_sam import build_efficient_sam_vitt, build_efficient_sam_vits
from groundingdino.util.inference import load_model, load_image, predict

IMAGE_PATH = "{my_dir}/EfficientSAM_gdino/figs/examples/brush_with_toothbrush_000268.jpg"
TEXT_PROMPT = "toothbrush"

SAVE_PATH = "{my_dir}/EfficientSAM_gdino/figs/examples"
img_name = os.path.splitext(os.path.basename(IMAGE_PATH))[0]

GDINO_CONFIG  = "{my_dir}/EfficientSAM_gdino/grounding_dino/config/GroundingDINO_SwinT_OGC.py"
GDINO_WEIGHTS = "{my_dir}/EfficientSAM_gdino/grounding_dino/weights/groundingdino_swint_ogc.pth"
EFSAM_S_ZIP   = "{my_dir}/EfficientSAM_gdino/weights/efficient_sam_vits.pt.zip"

OUTPUT_DIR = os.path.join(SAVE_PATH, "outputs")
os.makedirs(OUTPUT_DIR, exist_ok=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("[Device]", device)

# ---------- GroundingDINO ----------
gdino = load_model(GDINO_CONFIG, GDINO_WEIGHTS)
image_source, image_pre = load_image(IMAGE_PATH)

# ---------- EfficientSAM ----------
models = {}
models["efficientsam_ti"] = build_efficient_sam_vitt().to(device).eval()
if os.path.isfile(EFSAM_S_ZIP):
    with zipfile.ZipFile(EFSAM_S_ZIP, "r") as zf: zf.extractall("weights")
models["efficientsam_s"]  = build_efficient_sam_vits().to(device).eval()

# ---------- Image tensor ----------
img_np = np.array(Image.open(IMAGE_PATH).convert("RGB"))
H, W = img_np.shape[:2]
img_tensor = transforms.ToTensor()(img_np).to(device)  # [C,H,W]


# 2. Get a bbox from GroundingDINO

# ---------- Get bbox with GDINO ----------
boxes_norm, logits, phrases = predict(
    model=gdino, image=image_pre, caption=TEXT_PROMPT,
    box_threshold=0.35, text_threshold=0.25
)
if len(boxes_norm) == 0:
    raise SystemExit("[GroundingDINO] no box")

top = int(torch.argmax(logits).item())

# boxes_norm[top] = (cx, cy, w, h)  in [0,1]
cxcywh = boxes_norm[top].detach().cpu()  # -> CPU tensor

# Build x1,y1,x2,y2 with tensor ops, then convert with round + int
x1, y1, x2, y2 = torch.tensor([
    (cxcywh[0] - cxcywh[2] / 2) * W,
    (cxcywh[1] - cxcywh[3] / 2) * H,
    (cxcywh[0] + cxcywh[2] / 2) * W,
    (cxcywh[1] + cxcywh[3] / 2) * H,
]).round().to(torch.int64).tolist()

# Tidy/clamp the coordinates
x1, x2 = max(0, min(x1, x2)), min(W, max(x1, x2))
y1, y2 = max(0, min(y1, y2)), min(H, max(y1, y2))
print(f"[Box pixel] {x1},{y1} โ†’ {x2},{y2}")


# 3. Feed the bbox prompt into EfficientSAM!!

# ---------- Convert to a 'box prompt' here ----------
# SAM/ESAM convention: give two points with labels [2, 3] and they are interpreted as a box.
# label 2 = top-left, label 3 = bottom-right
box_pts = np.array([[x1, y1], [x2, y2]], dtype=np.int64)
box_lbl = np.array([2, 3], dtype=np.int64)
input_points = torch.from_numpy(box_pts)[None, None, ...].to(device)   # [1,1,2,2]
input_labels = torch.from_numpy(box_lbl)[None, None, ...].to(device)   # [1,1,2]

# ---------- Optional: visualize the bbox/points ----------
dbg = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR).copy()
cv2.rectangle(dbg, (x1,y1), (x2,y2), (0,255,0), 2)
cv2.circle(dbg, (x1,y1), 4, (255,0,0), -1)  # label=2
cv2.circle(dbg, (x2,y2), 4, (0,0,255), -1)  # label=3
cv2.imwrite(os.path.join(OUTPUT_DIR, f"{img_name}_bbox_prompt.jpg"), dbg)


# 4. Visualize the results

def clip_mask_to_bbox(mask, x1, y1, x2, y2):
    out = np.zeros_like(mask, dtype=mask.dtype)
    out[y1:y2, x1:x2] = mask[y1:y2, x1:x2]
    return out

def keep_largest_component(mask_uint8):
    num, labels = cv2.connectedComponents(mask_uint8.astype(np.uint8))
    if num <= 2: return mask_uint8
    areas = [(labels == i).sum() for i in range(1, num)]
    i_best = int(np.argmax(areas)) + 1
    return (labels == i_best).astype(np.uint8)

amp_ctx = torch.amp.autocast("cuda") if device.type == "cuda" else contextlib.nullcontext()

for name, esam in models.items():
    with torch.inference_mode(), amp_ctx:
        logits, iou = esam(img_tensor[None, ...], input_points, input_labels)  # box prompt
    order = torch.argsort(iou, dim=-1, descending=True)
    logits = torch.take_along_dim(logits, order[..., None, None], dim=2)
    cand = (logits[0,0] >= 0).to(torch.uint8).cpu().numpy()  # [K,H,W]

    # Among the top candidates, keep the largest component inside the bbox (noise suppression)
    best = cand[0]
    best = clip_mask_to_bbox(best, x1, y1, x2, y2)
    best = keep_largest_component(best)

    # Save outputs
    masked = (img_np.astype(np.uint8) * best[:, :, None])
    Image.fromarray(masked).save(os.path.join(OUTPUT_DIR, f"{img_name}_{name}_mask.png"))
    overlay = img_np.copy()
    overlay[best == 1] = (0.6 * overlay[best == 1] + 0.4 * np.array([0,255,0], np.uint8)).astype(np.uint8)
    cv2.imwrite(os.path.join(OUTPUT_DIR, f"{img_name}_{name}_overlay.jpg"),
                cv2.cvtColor(overlay, cv2.COLOR_RGB2BGR))
print("done." , os.path.join(OUTPUT_DIR, f"{img_name}_{name}_overlay.jpg"))

[Image: bbox prompt and segmentation results]

Through the process above, you can clearly see the bbox and segmentation results!!

Connected with several models like this, you can do many fun projects~^^

This post is licensed under CC BY 4.0 by the author.