diff --git a/README.md b/README.md
index c7591b0..5ff7cf9 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
- DeepSeek LLM
+ DeepSeek AI

@@ -11,7 +11,7 @@ Homepage
-
+
 Chat
@@ -53,7 +53,7 @@ 📖 Citation
 📄 Paper Link | 📄 Arxiv Paper Link |
- 👁️ Demo
+ 👁️ Demo

 ## 1. Introduction
@@ -69,6 +69,8 @@ Zhiyu Wu*, Xiaokang Chen*, Zizheng Pan*, Xingchao Liu*, Wen Liu**, Damai Dai, Hu
 ![](./images/vl2_teaser.jpeg)

 ## 2. Release
+✅ 2025-2-6: Naive Gradio Demo implemented on Hugging Face Space [deepseek-vl2-small](https://huggingface.co/spaces/deepseek-ai/deepseek-vl2-small).
+
 ✅ 2024-12-25: Gradio Demo Example, Incremental Prefilling and VLMEvalKit Support.

 ✅ 2024-12-13: DeepSeek-VL2 family released, including DeepSeek-VL2-tiny, DeepSeek-VL2-small, DeepSeek-VL2.
@@ -333,6 +335,9 @@ This is image_3:
 <|Assistant|>: The first image contains carrots. The second image contains corn. The third image contains meat.<｜end▁of▁sentence｜>
 ```
+To parse the bounding box coordinates, please refer to [parse_ref_bbox](https://github.com/deepseek-ai/DeepSeek-VL2/blob/main/deepseek_vl2/serve/app_modules/utils.py#L270-L298).
+
+
 ### Full Inference Example

 ```shell
 # without incremental prefilling
diff --git a/deepseek_vl2/models/siglip_vit.py b/deepseek_vl2/models/siglip_vit.py
index 67f30e8..f06c25f 100644
--- a/deepseek_vl2/models/siglip_vit.py
+++ b/deepseek_vl2/models/siglip_vit.py
@@ -13,7 +13,6 @@
 )
 from timm.models._manipulate import named_apply, checkpoint_seq, adapt_input_conv
 from transformers.modeling_utils import is_flash_attn_2_available
-from xformers.ops import memory_efficient_attention

 from functools import partial
@@ -134,6 +133,8 @@ def __init__(
         self.proj_drop = nn.Dropout(proj_drop) if proj_drop > 0. else nn.Identity()

     def forward(self, x: torch.Tensor) -> torch.Tensor:
+        from xformers.ops import memory_efficient_attention
+
         B, N, C = x.shape
         qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim)
diff --git a/deepseek_vl2/serve/app_modules/utils.py b/deepseek_vl2/serve/app_modules/utils.py
index 9a9b98d..2683fdf 100755
--- a/deepseek_vl2/serve/app_modules/utils.py
+++ b/deepseek_vl2/serve/app_modules/utils.py
@@ -270,7 +270,7 @@ def pil_to_base64(
 def parse_ref_bbox(response, image: Image.Image):
     try:
         image = image.copy()
-        image_h, image_w = image.size
+        image_w, image_h = image.size
         draw = ImageDraw.Draw(image)

         ref = re.findall(r'<\|ref\|>.*?<\|/ref\|>', response)
@@ -291,10 +291,10 @@ def parse_ref_bbox(response, image: Image.Image):

         for indice, (box, label) in enumerate(zip(boxes, labels)):
             box = (
-                int(box[0] / 999 * image_h),
-                int(box[1] / 999 * image_w),
-                int(box[2] / 999 * image_h),
-                int(box[3] / 999 * image_w),
+                int(box[0] / 999 * image_w),
+                int(box[1] / 999 * image_h),
+                int(box[2] / 999 * image_w),
+                int(box[3] / 999 * image_h),
             )
             box_color = BOX2COLOR[indice % len(BOX2COLOR.keys())]
diff --git a/deepseek_vl2/serve/inference.py b/deepseek_vl2/serve/inference.py
index 9b445d6..4feaa97 100755
--- a/deepseek_vl2/serve/inference.py
+++ b/deepseek_vl2/serve/inference.py
@@ -22,7 +22,6 @@
 import torch
 import transformers
-from joblib.externals.cloudpickle import instance
 from transformers import (
     AutoModelForCausalLM,
     StoppingCriteria,
diff --git a/images/badge.svg b/images/badge.svg
new file mode 100644
index 0000000..1551f56
--- /dev/null
+++ b/images/badge.svg
@@ -0,0 +1 @@
+DeepSeek: HomepageDeepSeekHomepage
diff --git a/pyproject.toml b/pyproject.toml
index 5f63b6a..fd6c463 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,6 +15,7 @@ dependencies = [
     "torch==2.0.1",
     "transformers==4.38.2",
     "timm>=0.9.16",
+    "xformers>=0.0.21",
     "accelerate",
     "sentencepiece",
     "attrdict",
diff --git a/requirements.txt b/requirements.txt
index c7acd26..b89b147 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
 torch==2.0.1
 transformers==4.38.2
+xformers>=0.0.21
 timm>=0.9.16
 accelerate
 sentencepiece
diff --git a/web_demo.py b/web_demo.py
index 894aece..9210e68 100755
--- a/web_demo.py
+++ b/web_demo.py
@@ -118,8 +118,8 @@
         [
             [
                 "images/multi_image_1.jpeg",
-                "images/mi_2.jpeg",
-                "images/mi_3.jpeg"
+                "images/multi_image_2.jpeg",
+                "images/multi_image_3.jpeg"
             ],
             "能帮我用这几个食材做一道菜吗?",
         ]
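
A note on the `parse_ref_bbox` fix above: `PIL.Image.size` returns `(width, height)`, and the grounding boxes in the model response are `(x1, y1, x2, y2)` tuples normalized to 0-999, so x values must be scaled by the image width and y values by the image height. Below is a minimal standalone sketch of that scaling convention; the helper name `scale_box` and the example sizes are illustrative, not part of the repository.

```python
from PIL import Image


def scale_box(box, image: Image.Image):
    """Map an (x1, y1, x2, y2) box normalized to 0-999 onto pixel coordinates.

    PIL's Image.size is (width, height), so x values scale by the width and
    y values by the height -- the convention the patched parse_ref_bbox uses.
    """
    image_w, image_h = image.size
    x1, y1, x2, y2 = box
    return (
        int(x1 / 999 * image_w),
        int(y1 / 999 * image_h),
        int(x2 / 999 * image_w),
        int(y2 / 999 * image_h),
    )


if __name__ == "__main__":
    img = Image.new("RGB", (1280, 720))          # 1280 px wide, 720 px tall
    print(scale_box((100, 200, 500, 800), img))  # -> (128, 144, 640, 576)
```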
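
The `siglip_vit.py` hunk moves the `xformers` import from module level into `forward`, so merely importing the vision tower no longer hard-requires xformers, while `xformers>=0.0.21` is now declared in `pyproject.toml` and `requirements.txt` for the attention path that does use it. A rough sketch of the same deferred-import pattern follows; the `LazyAttention` class and its shapes are illustrative assumptions, not the repository's implementation.

```python
import torch
import torch.nn as nn


class LazyAttention(nn.Module):
    """Attention block that defers the xformers import until forward(),
    mirroring the pattern used in siglip_vit.py."""

    def __init__(self, dim: int, num_heads: int = 8):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.qkv = nn.Linear(dim, dim * 3)
        self.proj = nn.Linear(dim, dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Imported here, not at module level, so environments without
        # xformers can still import this file.
        from xformers.ops import memory_efficient_attention

        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim)
        q, k, v = qkv.unbind(2)  # each (B, N, num_heads, head_dim)
        out = memory_efficient_attention(q, k, v)
        return self.proj(out.reshape(B, N, C))
```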