From 0dae14349354565197169db9ac7e29fe80e780c4 Mon Sep 17 00:00:00 2001 From: StevenLiuWen Date: Tue, 4 Feb 2025 19:11:34 +0800 Subject: [PATCH 01/11] Update inference.py --- deepseek_vl2/serve/inference.py | 1 - 1 file changed, 1 deletion(-) diff --git a/deepseek_vl2/serve/inference.py b/deepseek_vl2/serve/inference.py index 9b445d6..4feaa97 100755 --- a/deepseek_vl2/serve/inference.py +++ b/deepseek_vl2/serve/inference.py @@ -22,7 +22,6 @@ import torch import transformers -from joblib.externals.cloudpickle import instance from transformers import ( AutoModelForCausalLM, StoppingCriteria, From c10e7eba367352307a69aefa60f848f69af346ed Mon Sep 17 00:00:00 2001 From: StevenLiuWen Date: Tue, 4 Feb 2025 19:31:10 +0800 Subject: [PATCH 02/11] update requirements.txt --- pyproject.toml | 1 + requirements.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 5f63b6a..fd6c463 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ dependencies = [ "torch==2.0.1", "transformers==4.38.2", "timm>=0.9.16", + "xformers>=0.0.21", "accelerate", "sentencepiece", "attrdict", diff --git a/requirements.txt b/requirements.txt index c7acd26..b89b147 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ torch==2.0.1 transformers==4.38.2 +xformers>=0.0.21 timm>=0.9.16 accelerate sentencepiece From 0b05daf3d26c0cbf602ba857a1b36f8b0dfe6e94 Mon Sep 17 00:00:00 2001 From: StevenLiuWen Date: Wed, 5 Feb 2025 19:23:13 +0800 Subject: [PATCH 03/11] Update web_demo.py --- web_demo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/web_demo.py b/web_demo.py index 894aece..9210e68 100755 --- a/web_demo.py +++ b/web_demo.py @@ -118,8 +118,8 @@ [ [ "images/multi_image_1.jpeg", - "images/mi_2.jpeg", - "images/mi_3.jpeg" + "images/multi_image_2.jpeg", + "images/multi_image_3.jpeg" ], "能帮我用这几个食材做一道菜吗?", ] From 6eedfe3877b3f95c7fac457ab9ed7adeb90c7298 Mon Sep 17 00:00:00 2001 From: StevenLiuWen Date: Wed, 5 Feb 2025 19:50:10 +0800 Subject: [PATCH 04/11] add huggingface space demo --- README.md | 4 ++-- images/badge.svg | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) create mode 100644 images/badge.svg diff --git a/README.md b/README.md index c7591b0..2c6cab9 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@
- DeepSeek LLM + DeepSeek AI

@@ -11,7 +11,7 @@ Homepage - + Chat diff --git a/images/badge.svg b/images/badge.svg new file mode 100644 index 0000000..1551f56 --- /dev/null +++ b/images/badge.svg @@ -0,0 +1 @@ +DeepSeek: HomepageDeepSeekHomepage From a667810e3c509659adf39a2aa5ed05711d08e300 Mon Sep 17 00:00:00 2001 From: StevenLiuWen Date: Thu, 6 Feb 2025 00:02:05 +0800 Subject: [PATCH 05/11] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 2c6cab9..440ed7e 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,8 @@ Zhiyu Wu*, Xiaokang Chen*, Zizheng Pan*, Xingchao Liu*, Wen Liu**, Damai Dai, Hu ![](./images/vl2_teaser.jpeg) ## 2. Release +✅ 2025-2-6: Naive Gradio Demo on Huggingface Space [deepseek-vl2-small](https://huggingface.co/spaces/deepseek-ai/deepseek-vl2-small). + ✅ 2024-12-25: Gradio Demo Example, Incremental Prefilling and VLMEvalKit Support. ✅ 2024-12-13: DeepSeek-VL2 family released, including DeepSeek-VL2-tiny, DeepSeek-VL2-small, DeepSeek-VL2. From a698c36a4cfbd3ca700e79e936f7bc16b6fd5957 Mon Sep 17 00:00:00 2001 From: StevenLiuWen Date: Thu, 6 Feb 2025 00:04:05 +0800 Subject: [PATCH 06/11] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 440ed7e..15f4f21 100644 --- a/README.md +++ b/README.md @@ -69,7 +69,7 @@ Zhiyu Wu*, Xiaokang Chen*, Zizheng Pan*, Xingchao Liu*, Wen Liu**, Damai Dai, Hu ![](./images/vl2_teaser.jpeg) ## 2. Release -✅ 2025-2-6: Naive Gradio Demo on Huggingface Space [deepseek-vl2-small](https://huggingface.co/spaces/deepseek-ai/deepseek-vl2-small). +✅ 2025-2-6: Naive Implemented Gradio Demo on Huggingface Space [deepseek-vl2-small](https://huggingface.co/spaces/deepseek-ai/deepseek-vl2-small). ✅ 2024-12-25: Gradio Demo Example, Incremental Prefilling and VLMEvalKit Support. From 32d92585ede32f6b627f8d6953b7dbe311219bf6 Mon Sep 17 00:00:00 2001 From: Zizheng Pan Date: Sun, 9 Feb 2025 18:01:06 +0800 Subject: [PATCH 07/11] Update demo link --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 15f4f21..aff4393 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ 📖 Citation
📄 Paper Link | 📄 Arxiv Paper Link | - 👁️ Demo + 👁️ Demo

## 1. Introduction From cf21982aa5143e9606f2071440abe906a2c45419 Mon Sep 17 00:00:00 2001 From: zxy Date: Wed, 19 Feb 2025 13:46:33 +0800 Subject: [PATCH 08/11] optimize lib dependencies --- deepseek_vl2/models/modeling_deepseek_vl_v2.py | 1 - deepseek_vl2/models/siglip_vit.py | 3 ++- requirements.txt | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/deepseek_vl2/models/modeling_deepseek_vl_v2.py b/deepseek_vl2/models/modeling_deepseek_vl_v2.py index 957464f..97288fd 100644 --- a/deepseek_vl2/models/modeling_deepseek_vl_v2.py +++ b/deepseek_vl2/models/modeling_deepseek_vl_v2.py @@ -1,4 +1,3 @@ -from attrdict import AttrDict from dataclasses import dataclass import logging import gc diff --git a/deepseek_vl2/models/siglip_vit.py b/deepseek_vl2/models/siglip_vit.py index 67f30e8..f06c25f 100644 --- a/deepseek_vl2/models/siglip_vit.py +++ b/deepseek_vl2/models/siglip_vit.py @@ -13,7 +13,6 @@ ) from timm.models._manipulate import named_apply, checkpoint_seq, adapt_input_conv from transformers.modeling_utils import is_flash_attn_2_available -from xformers.ops import memory_efficient_attention from functools import partial @@ -134,6 +133,8 @@ def __init__( self.proj_drop = nn.Dropout(proj_drop) if proj_drop > 0. else nn.Identity() def forward(self, x: torch.Tensor) -> torch.Tensor: + from xformers.ops import memory_efficient_attention + B, N, C = x.shape qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim) diff --git a/requirements.txt b/requirements.txt index b89b147..02d0661 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,6 @@ xformers>=0.0.21 timm>=0.9.16 accelerate sentencepiece -attrdict einops # for gradio demo From b6e557c73e284014e835097478c69bface0a0bde Mon Sep 17 00:00:00 2001 From: zxy Date: Wed, 19 Feb 2025 19:30:58 +0800 Subject: [PATCH 09/11] recover attrdict --- deepseek_vl2/models/modeling_deepseek_vl_v2.py | 1 + requirements.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/deepseek_vl2/models/modeling_deepseek_vl_v2.py b/deepseek_vl2/models/modeling_deepseek_vl_v2.py index 97288fd..957464f 100644 --- a/deepseek_vl2/models/modeling_deepseek_vl_v2.py +++ b/deepseek_vl2/models/modeling_deepseek_vl_v2.py @@ -1,3 +1,4 @@ +from attrdict import AttrDict from dataclasses import dataclass import logging import gc diff --git a/requirements.txt b/requirements.txt index 02d0661..b89b147 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ xformers>=0.0.21 timm>=0.9.16 accelerate sentencepiece +attrdict einops # for gradio demo From 699e00ad0df012f955b3858b76dc11ae36fbc57c Mon Sep 17 00:00:00 2001 From: StevenLiuWen Date: Wed, 26 Feb 2025 13:00:15 +0800 Subject: [PATCH 10/11] Update utils.py --- deepseek_vl2/serve/app_modules/utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/deepseek_vl2/serve/app_modules/utils.py b/deepseek_vl2/serve/app_modules/utils.py index 9a9b98d..2683fdf 100755 --- a/deepseek_vl2/serve/app_modules/utils.py +++ b/deepseek_vl2/serve/app_modules/utils.py @@ -270,7 +270,7 @@ def pil_to_base64( def parse_ref_bbox(response, image: Image.Image): try: image = image.copy() - image_h, image_w = image.size + image_w, image_h = image.size draw = ImageDraw.Draw(image) ref = re.findall(r'<\|ref\|>.*?<\|/ref\|>', response) @@ -291,10 +291,10 @@ def parse_ref_bbox(response, image: Image.Image): for indice, (box, label) in enumerate(zip(boxes, labels)): box = ( - int(box[0] / 999 * image_h), - int(box[1] / 999 * image_w), - int(box[2] / 999 * image_h), - int(box[3] / 999 * image_w), + int(box[0] / 999 * image_w), + int(box[1] / 999 * image_h), + int(box[2] / 999 * image_w), + int(box[3] / 999 * image_h), ) box_color = BOX2COLOR[indice % len(BOX2COLOR.keys())] From ef9f91e2b6426536b83294c11742c27be66361b1 Mon Sep 17 00:00:00 2001 From: StevenLiuWen Date: Wed, 26 Feb 2025 13:03:37 +0800 Subject: [PATCH 11/11] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index aff4393..5ff7cf9 100644 --- a/README.md +++ b/README.md @@ -335,6 +335,9 @@ This is image_3: <|Assistant|>: The first image contains carrots. The second image contains corn. The third image contains meat.<|end▁of▁sentence|> ``` +Parse the bounding box coordinates, please refer to [parse_ref_bbox](https://github.com/deepseek-ai/DeepSeek-VL2/blob/main/deepseek_vl2/serve/app_modules/utils.py#L270-L298). + + ### Full Inference Example ```shell # without incremental prefilling