diff --git a/README.md b/README.md
index c7591b0..5ff7cf9 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
@@ -11,7 +11,7 @@
-
+
@@ -53,7 +53,7 @@
📖 Citation
📄 Paper Link |
📄 Arxiv Paper Link |
-🖥️ Demo
+🖥️ Demo
## 1. Introduction
@@ -69,6 +69,8 @@ Zhiyu Wu*, Xiaokang Chen*, Zizheng Pan*, Xingchao Liu*, Wen Liu**, Damai Dai, Hu

## 2. Release
+✅ 2025-02-06: Naive Gradio demo implemented on Hugging Face Space [deepseek-vl2-small](https://huggingface.co/spaces/deepseek-ai/deepseek-vl2-small).
+
✅ 2024-12-25: Gradio Demo Example, Incremental Prefilling and VLMEvalKit Support.
✅ 2024-12-13: DeepSeek-VL2 family released, including DeepSeek-VL2-tiny, DeepSeek-VL2-small, DeepSeek-VL2.
@@ -333,6 +335,9 @@ This is image_3:
<|Assistant|>: The first image contains carrots. The second image contains corn. The third image contains meat.<｜end▁of▁sentence｜>
```
+To parse the bounding box coordinates, please refer to [parse_ref_bbox](https://github.com/deepseek-ai/DeepSeek-VL2/blob/main/deepseek_vl2/serve/app_modules/utils.py#L270-L298).
+
+
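+A minimal sketch of calling this helper (the sample response string below is illustrative; box coordinates inside `<|det|>` tokens are normalized to the 0-999 range, and the image path follows the repo's example images):
+
+```python
+from PIL import Image
+
+from deepseek_vl2.serve.app_modules.utils import parse_ref_bbox
+
+# Illustrative model response containing grounding tokens.
+answer = "<|ref|>the red cup<|/ref|><|det|>[[120, 80, 560, 900]]<|/det|>"
+
+# parse_ref_bbox draws the referenced boxes on a copy of the image and
+# returns it (or None if nothing could be parsed from the response).
+vg_image = parse_ref_bbox(answer, image=Image.open("./images/visual_grounding_1.jpeg"))
+if vg_image is not None:
+    vg_image.save("./vg.jpg", format="JPEG", quality=85)
+```
+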
### Full Inference Example
```shell
# without incremental prefilling
diff --git a/deepseek_vl2/models/siglip_vit.py b/deepseek_vl2/models/siglip_vit.py
index 67f30e8..f06c25f 100644
--- a/deepseek_vl2/models/siglip_vit.py
+++ b/deepseek_vl2/models/siglip_vit.py
@@ -13,7 +13,6 @@
)
from timm.models._manipulate import named_apply, checkpoint_seq, adapt_input_conv
from transformers.modeling_utils import is_flash_attn_2_available
-from xformers.ops import memory_efficient_attention
from functools import partial
@@ -134,6 +133,8 @@ def __init__(
self.proj_drop = nn.Dropout(proj_drop) if proj_drop > 0. else nn.Identity()
def forward(self, x: torch.Tensor) -> torch.Tensor:
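+ # Imported here rather than at module level, so importing this file does not require xformers;
+ # the memory-efficient attention kernel is only needed on this forward path.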
+ from xformers.ops import memory_efficient_attention
+
B, N, C = x.shape
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim)
diff --git a/deepseek_vl2/serve/app_modules/utils.py b/deepseek_vl2/serve/app_modules/utils.py
index 9a9b98d..2683fdf 100755
--- a/deepseek_vl2/serve/app_modules/utils.py
+++ b/deepseek_vl2/serve/app_modules/utils.py
@@ -270,7 +270,7 @@ def pil_to_base64(
def parse_ref_bbox(response, image: Image.Image):
try:
image = image.copy()
- image_h, image_w = image.size
+ image_w, image_h = image.size  # PIL Image.size is (width, height)
draw = ImageDraw.Draw(image)
ref = re.findall(r'<\|ref\|>.*?<\|/ref\|>', response)
@@ -291,10 +291,10 @@ def parse_ref_bbox(response, image: Image.Image):
for indice, (box, label) in enumerate(zip(boxes, labels)):
box = (
- int(box[0] / 999 * image_h),
- int(box[1] / 999 * image_w),
- int(box[2] / 999 * image_h),
- int(box[3] / 999 * image_w),
+ int(box[0] / 999 * image_w),  # x1: box coords are normalized to [0, 999], scale by width
+ int(box[1] / 999 * image_h),  # y1: scale by height
+ int(box[2] / 999 * image_w),  # x2
+ int(box[3] / 999 * image_h),  # y2
)
box_color = BOX2COLOR[indice % len(BOX2COLOR.keys())]
diff --git a/deepseek_vl2/serve/inference.py b/deepseek_vl2/serve/inference.py
index 9b445d6..4feaa97 100755
--- a/deepseek_vl2/serve/inference.py
+++ b/deepseek_vl2/serve/inference.py
@@ -22,7 +22,6 @@
import torch
import transformers
-from joblib.externals.cloudpickle import instance
from transformers import (
AutoModelForCausalLM,
StoppingCriteria,
diff --git a/images/badge.svg b/images/badge.svg
new file mode 100644
index 0000000..1551f56
--- /dev/null
+++ b/images/badge.svg
@@ -0,0 +1 @@
+
diff --git a/pyproject.toml b/pyproject.toml
index 5f63b6a..fd6c463 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,6 +15,7 @@ dependencies = [
"torch==2.0.1",
"transformers==4.38.2",
"timm>=0.9.16",
+ "xformers>=0.0.21",
"accelerate",
"sentencepiece",
"attrdict",
diff --git a/requirements.txt b/requirements.txt
index c7acd26..b89b147 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
torch==2.0.1
transformers==4.38.2
+xformers>=0.0.21
timm>=0.9.16
accelerate
sentencepiece
diff --git a/web_demo.py b/web_demo.py
index 894aece..9210e68 100755
--- a/web_demo.py
+++ b/web_demo.py
@@ -118,8 +118,8 @@
[
[
"images/multi_image_1.jpeg",
- "images/mi_2.jpeg",
- "images/mi_3.jpeg"
+ "images/multi_image_2.jpeg",
+ "images/multi_image_3.jpeg"
],
"่ฝๅธฎๆ็จ่ฟๅ ไธช้ฃๆๅไธ้่ๅ?",
]