From 7adccdedb3e26bdc7449566e78026efa5883784d Mon Sep 17 00:00:00 2001 From: Tanvika Boyineni Date: Wed, 6 Aug 2025 11:42:00 -0700 Subject: [PATCH 1/4] fix: config logic support --- src/sagemaker/jumpstart/factory/estimator.py | 1 + src/sagemaker/jumpstart/factory/model.py | 12 ++++++++---- src/sagemaker/jumpstart/types.py | 8 ++++---- src/sagemaker/jumpstart/utils.py | 9 +++++++-- 4 files changed, 20 insertions(+), 10 deletions(-) diff --git a/src/sagemaker/jumpstart/factory/estimator.py b/src/sagemaker/jumpstart/factory/estimator.py index 051cda0f4a..92fdb4d6a1 100644 --- a/src/sagemaker/jumpstart/factory/estimator.py +++ b/src/sagemaker/jumpstart/factory/estimator.py @@ -837,6 +837,7 @@ def _add_config_name_to_kwargs( kwargs.config_name = kwargs.config_name or get_top_ranked_config_name( scope=JumpStartScriptScope.TRAINING, + instance_type=kwargs.instance_type, **get_model_info_default_kwargs(kwargs, include_config_name=False), ) diff --git a/src/sagemaker/jumpstart/factory/model.py b/src/sagemaker/jumpstart/factory/model.py index 53ded3f275..c5dab29c16 100644 --- a/src/sagemaker/jumpstart/factory/model.py +++ b/src/sagemaker/jumpstart/factory/model.py @@ -559,6 +559,7 @@ def _add_config_name_to_init_kwargs(kwargs: JumpStartModelInitKwargs) -> JumpSta kwargs.config_name = kwargs.config_name or get_top_ranked_config_name( **get_model_info_default_kwargs(kwargs, include_config_name=False), scope=JumpStartScriptScope.INFERENCE, + instance_type=kwargs.instance_type, ) if kwargs.config_name is None: @@ -618,6 +619,7 @@ def _add_config_name_to_deploy_kwargs( default_config_name = kwargs.config_name or get_top_ranked_config_name( **get_model_info_default_kwargs(kwargs, include_config_name=False), scope=JumpStartScriptScope.INFERENCE, + instance_type=kwargs.instance_type, ) kwargs.config_name = kwargs.config_name or default_config_name @@ -927,6 +929,12 @@ def get_init_kwargs( model_init_kwargs = _add_vulnerable_and_deprecated_status_to_kwargs(kwargs=model_init_kwargs) model_init_kwargs = _add_model_version_to_kwargs(kwargs=model_init_kwargs) + + # Add instance type before config selection so config compatibility can be checked + model_init_kwargs = _add_instance_type_to_kwargs( + kwargs=model_init_kwargs, disable_instance_type_logging=disable_instance_type_logging + ) + model_init_kwargs = _add_config_name_to_init_kwargs(kwargs=model_init_kwargs) model_init_kwargs = _add_sagemaker_session_with_custom_user_agent_to_kwargs( @@ -936,10 +944,6 @@ def get_init_kwargs( model_init_kwargs = _add_model_name_to_kwargs(kwargs=model_init_kwargs) - model_init_kwargs = _add_instance_type_to_kwargs( - kwargs=model_init_kwargs, disable_instance_type_logging=disable_instance_type_logging - ) - model_init_kwargs = _add_image_uri_to_kwargs(kwargs=model_init_kwargs) if hub_arn: diff --git a/src/sagemaker/jumpstart/types.py b/src/sagemaker/jumpstart/types.py index 5b45b21bd8..b0cea6eb02 100644 --- a/src/sagemaker/jumpstart/types.py +++ b/src/sagemaker/jumpstart/types.py @@ -1723,10 +1723,10 @@ def get_top_config_from_ranking( ranked_config_names = rankings.rankings for config_name in ranked_config_names: resolved_config = self.configs[config_name].resolved_config - if instance_type and instance_type not in getattr( - resolved_config, instance_type_attribute - ): - continue + if instance_type: + supported_instance_types = getattr(resolved_config, instance_type_attribute, []) + if supported_instance_types and instance_type not in supported_instance_types: + continue return self.configs[config_name] return None diff --git a/src/sagemaker/jumpstart/utils.py b/src/sagemaker/jumpstart/utils.py index 15f9e9b52e..b7c22bbda6 100644 --- a/src/sagemaker/jumpstart/utils.py +++ b/src/sagemaker/jumpstart/utils.py @@ -1233,9 +1233,14 @@ def get_top_ranked_config_name( tolerate_vulnerable_model: bool = False, hub_arn: Optional[str] = None, ranking_name: enums.JumpStartConfigRankingName = enums.JumpStartConfigRankingName.DEFAULT, + instance_type: Optional[str] = None, ) -> Optional[str]: """Returns the top ranked config name for the given model ID and region. + Args: + instance_type (Optional[str]): The instance type to filter configs by compatibility. + If provided, only configs that support this instance type will be considered. + Raises: ValueError: If the script scope is not supported by JumpStart. """ @@ -1254,7 +1259,7 @@ def get_top_ranked_config_name( if scope == enums.JumpStartScriptScope.INFERENCE: return ( model_specs.inference_configs.get_top_config_from_ranking( - ranking_name=ranking_name + ranking_name=ranking_name, instance_type=instance_type ).config_name if model_specs.inference_configs else None @@ -1262,7 +1267,7 @@ def get_top_ranked_config_name( if scope == enums.JumpStartScriptScope.TRAINING: return ( model_specs.training_configs.get_top_config_from_ranking( - ranking_name=ranking_name + ranking_name=ranking_name, instance_type=instance_type ).config_name if model_specs.training_configs else None From d6c280f729a78caa96c6812e10a0c8f08b457fc8 Mon Sep 17 00:00:00 2001 From: Tanvika Boyineni Date: Wed, 6 Aug 2025 12:30:08 -0700 Subject: [PATCH 2/4] fix: adding auto resolution config support --- specfileex | 2960 ++++++++++++++++++++++ src/sagemaker/jumpstart/factory/model.py | 10 +- src/sagemaker/jumpstart/types.py | 7 +- src/sagemaker/jumpstart/utils.py | 4 - test_unified_model_card.py | 193 ++ 5 files changed, 3163 insertions(+), 11 deletions(-) create mode 100644 specfileex create mode 100644 test_unified_model_card.py diff --git a/specfileex b/specfileex new file mode 100644 index 0000000000..e2d15647d0 --- /dev/null +++ b/specfileex @@ -0,0 +1,2960 @@ +{ + "model_id": "meta-textgeneration-llama-2-7b-f", + "provider": "meta", + "url": "https://ai.meta.com/resources/models-and-libraries/llama-downloads/", + "version": "4.19.0", + "min_sdk_version": "2.225.0", + "training_supported": true, + "incremental_training_supported": true, + "hosting_ecr_specs": { + "framework": "huggingface-llm", + "framework_version": "2.0.0", + "py_version": "py310" + }, + "hosting_artifact_key": "meta-textgeneration/meta-textgeneration-llama-2-7b-f/artifacts/inference/v1.1.0/", + "hosting_script_key": "source-directory-tarballs/meta/inference/textgeneration/v1.2.3/sourcedir.tar.gz", + "hosting_prepacked_artifact_key": "meta-textgeneration/meta-textgeneration-llama-2-7b-f/artifacts/inference-prepack/v1.1.0/", + "hosting_prepacked_artifact_version": "1.1.0", + "hosting_use_script_uri": false, + "hosting_eula_key": "fmhMetadata/eula/llamaEula.txt", + "inference_vulnerable": false, + "inference_dependencies": [], + "inference_vulnerabilities": [], + "training_vulnerable": false, + "training_dependencies": [ + "accelerate==0.33.0", + "bitsandbytes==0.39.1", + "black==23.7.0", + "brotli==1.0.9", + "datasets==2.14.1", + "docstring-parser==0.16", + "fire==0.5.0", + "huggingface-hub==0.24.2", + "inflate64==0.3.1", + "loralib==0.1.1", + "multivolumefile==0.2.3", + "mypy-extensions==1.0.0", + "nvidia-cublas-cu12==12.1.3.1", + "nvidia-cuda-cupti-cu12==12.1.105", + "nvidia-cuda-nvrtc-cu12==12.1.105", + "nvidia-cuda-runtime-cu12==12.1.105", + "nvidia-cudnn-cu12==8.9.2.26", + "nvidia-cufft-cu12==11.0.2.54", + "nvidia-curand-cu12==10.3.2.106", + "nvidia-cusolver-cu12==11.4.5.107", + "nvidia-cusolver-cu12==11.4.5.107", + "nvidia-cusparse-cu12==12.1.0.106", + "nvidia-nccl-cu12==2.19.3", + "nvidia-nvjitlink-cu12==12.3.101", + "nvidia-nvtx-cu12==12.1.105", + "pathspec==0.11.1", + "peft==0.4.0", + "py7zr==0.20.5", + "pybcj==1.0.1", + "pycryptodomex==3.18.0", + "pyppmd==1.0.0", + "pyzstd==0.15.9", + "safetensors==0.4.2", + "sagemaker_jumpstart_huggingface_script_utilities==1.2.7", + "sagemaker_jumpstart_script_utilities==1.1.9", + "scipy==1.11.1", + "shtab==1.7.1", + "termcolor==2.3.0", + "texttable==1.6.7", + "tokenize-rt==5.1.0", + "tokenizers==0.19.1", + "torch==2.2.0", + "transformers==4.43.1", + "triton==2.2.0", + "trl==0.8.1", + "typing-extensions==4.8.0", + "tyro==0.7.3" + ], + "training_vulnerabilities": [], + "deprecated": false, + "hyperparameters": [ + { + "name": "int8_quantization", + "type": "text", + "default": "False", + "options": [ + "True", + "False" + ], + "scope": "algorithm" + }, + { + "name": "enable_fsdp", + "type": "text", + "default": "True", + "options": [ + "True", + "False" + ], + "scope": "algorithm" + }, + { + "name": "epoch", + "type": "int", + "default": 1, + "min": 1, + "max": 1000, + "scope": "algorithm" + }, + { + "name": "learning_rate", + "type": "float", + "default": 0.0001, + "min": 1e-08, + "max": 1, + "scope": "algorithm" + }, + { + "name": "lora_r", + "type": "int", + "default": 8, + "min": 1, + "scope": "algorithm" + }, + { + "name": "lora_alpha", + "type": "int", + "default": 32, + "min": 1, + "scope": "algorithm" + }, + { + "name": "target_modules", + "type": "text", + "default": "q_proj,v_proj", + "scope": "algorithm" + }, + { + "name": "lora_dropout", + "type": "float", + "default": 0.05, + "min": 0, + "max": 1, + "scope": "algorithm" + }, + { + "name": "instruction_tuned", + "type": "text", + "default": "False", + "options": [ + "True", + "False" + ], + "scope": "algorithm" + }, + { + "name": "chat_dataset", + "type": "text", + "default": "True", + "options": [ + "True", + "False" + ], + "scope": "algorithm" + }, + { + "name": "add_input_output_demarcation_key", + "type": "text", + "default": "True", + "options": [ + "True", + "False" + ], + "scope": "algorithm" + }, + { + "name": "per_device_train_batch_size", + "type": "int", + "default": 1, + "min": 1, + "max": 1000, + "scope": "algorithm" + }, + { + "name": "per_device_eval_batch_size", + "type": "int", + "default": 1, + "min": 1, + "max": 1000, + "scope": "algorithm" + }, + { + "name": "max_train_samples", + "type": "int", + "default": -1, + "min": -1, + "scope": "algorithm" + }, + { + "name": "max_val_samples", + "type": "int", + "default": -1, + "min": -1, + "scope": "algorithm" + }, + { + "name": "seed", + "type": "int", + "default": 10, + "min": 1, + "max": 1000, + "scope": "algorithm" + }, + { + "name": "max_input_length", + "type": "int", + "default": -1, + "min": -1, + "scope": "algorithm" + }, + { + "name": "validation_split_ratio", + "type": "float", + "default": 0.2, + "min": 0, + "max": 1, + "scope": "algorithm" + }, + { + "name": "train_data_split_seed", + "type": "int", + "default": 0, + "min": 0, + "scope": "algorithm" + }, + { + "name": "preprocessing_num_workers", + "type": "text", + "default": "None", + "scope": "algorithm" + }, + { + "name": "sagemaker_submit_directory", + "type": "text", + "default": "/opt/ml/input/data/code/sourcedir.tar.gz", + "scope": "container" + }, + { + "name": "sagemaker_program", + "type": "text", + "default": "transfer_learning.py", + "scope": "container" + }, + { + "name": "sagemaker_container_log_level", + "type": "text", + "default": "20", + "scope": "container" + } + ], + "training_script_key": "source-directory-tarballs/training/meta-textgeneration/v1.2.0/sourcedir.tar.gz", + "training_prepacked_script_key": "source-directory-tarballs/training/meta-textgeneration/prepack/inference-meta-textgeneration/v1.2.0/sourcedir.tar.gz", + "training_prepacked_script_version": "1.2.0", + "training_ecr_specs": { + "framework": "huggingface", + "framework_version": "2.0.0", + "py_version": "py310", + "huggingface_transformers_version": "4.28.1" + }, + "training_artifact_key": "meta-training/v1.1.0/train-meta-textgeneration-llama-2-7b-f.tar.gz", + "inference_environment_variables": [ + { + "name": "SAGEMAKER_PROGRAM", + "type": "text", + "default": "inference.py", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "SAGEMAKER_SUBMIT_DIRECTORY", + "type": "text", + "default": "/opt/ml/model/code", + "scope": "container", + "required_for_model_class": false + }, + { + "name": "SAGEMAKER_CONTAINER_LOG_LEVEL", + "type": "text", + "default": "20", + "scope": "container", + "required_for_model_class": false + }, + { + "name": "SAGEMAKER_MODEL_SERVER_TIMEOUT", + "type": "text", + "default": "3600", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "ENDPOINT_SERVER_TIMEOUT", + "type": "int", + "default": 3600, + "scope": "container", + "required_for_model_class": true + }, + { + "name": "MODEL_CACHE_ROOT", + "type": "text", + "default": "/opt/ml/model", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "SAGEMAKER_ENV", + "type": "text", + "default": "1", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "HF_MODEL_ID", + "type": "text", + "default": "/opt/ml/model", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "OPTION_GPU_MEMORY_UTILIZATION", + "type": "text", + "default": "0.85", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "SM_NUM_GPUS", + "type": "text", + "default": "1", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "MAX_INPUT_LENGTH", + "type": "text", + "default": "4095", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "MAX_TOTAL_TOKENS", + "type": "text", + "default": "4096", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "MAX_BATCH_PREFILL_TOKENS", + "type": "text", + "default": "8192", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "MAX_CONCURRENT_REQUESTS", + "type": "text", + "default": "512", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "SAGEMAKER_MODEL_SERVER_WORKERS", + "type": "int", + "default": 1, + "scope": "container", + "required_for_model_class": true + } + ], + "metrics": [ + { + "Name": "huggingface-textgeneration:eval-loss", + "Regex": "eval_epoch_loss=tensor\\(([0-9\\.]+)" + }, + { + "Name": "huggingface-textgeneration:eval-ppl", + "Regex": "eval_ppl=tensor\\(([0-9\\.]+)" + }, + { + "Name": "huggingface-textgeneration:train-loss", + "Regex": "train_epoch_loss=([0-9\\.]+)" + } + ], + "default_inference_instance_type": "ml.g5.12xlarge", + "supported_inference_instance_types": [ + "ml.g5.12xlarge", + "ml.g5.24xlarge", + "ml.g5.2xlarge", + "ml.g5.48xlarge", + "ml.g5.4xlarge", + "ml.g5.8xlarge", + "ml.g6.12xlarge", + "ml.p4d.24xlarge" + ], + "default_training_instance_type": "ml.g5.12xlarge", + "supported_training_instance_types": [ + "ml.g5.12xlarge", + "ml.g5.24xlarge", + "ml.g5.48xlarge", + "ml.p3dn.24xlarge", + "ml.g4dn.12xlarge" + ], + "model_kwargs": {}, + "estimator_kwargs": { + "encrypt_inter_container_traffic": true, + "disable_output_compression": true, + "max_run": 360000 + }, + "fit_kwargs": {}, + "inference_volume_size": 256, + "training_volume_size": 256, + "inference_enable_network_isolation": true, + "training_enable_network_isolation": true, + "default_training_dataset_key": "training-datasets/oasst_top/train/", + "validation_supported": true, + "fine_tuning_supported": true, + "resource_name_base": "meta-textgeneration-llama-2-7b-f", + "gated_bucket": true, + "training_instance_type_variants": { + "regional_aliases": { + "af-south-1": { + "gpu_ecr_uri_1": "626614931356.dkr.ecr.af-south-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" + }, + "ap-east-1": { + "gpu_ecr_uri_1": "871362719292.dkr.ecr.ap-east-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" + }, + "ap-northeast-1": { + "gpu_ecr_uri_1": "763104351884.dkr.ecr.ap-northeast-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" + }, + "ap-northeast-2": { + "gpu_ecr_uri_1": "763104351884.dkr.ecr.ap-northeast-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" + }, + "ap-northeast-3": { + "gpu_ecr_uri_1": "364406365360.dkr.ecr.ap-northeast-3.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" + }, + "ap-south-1": { + "gpu_ecr_uri_1": "763104351884.dkr.ecr.ap-south-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" + }, + "ap-southeast-1": { + "gpu_ecr_uri_1": "763104351884.dkr.ecr.ap-southeast-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" + }, + "ap-southeast-2": { + "gpu_ecr_uri_1": "763104351884.dkr.ecr.ap-southeast-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" + }, + "ap-southeast-3": { + "gpu_ecr_uri_1": "907027046896.dkr.ecr.ap-southeast-3.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" + }, + "ca-central-1": { + "gpu_ecr_uri_1": "763104351884.dkr.ecr.ca-central-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" + }, + "ca-west-1": { + "gpu_ecr_uri_1": "204538143572.dkr.ecr.ca-west-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" + }, + "cn-north-1": { + "gpu_ecr_uri_1": "727897471807.dkr.ecr.cn-north-1.amazonaws.com.cn/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" + }, + "cn-northwest-1": { + "gpu_ecr_uri_1": "727897471807.dkr.ecr.cn-northwest-1.amazonaws.com.cn/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" + }, + "eu-central-1": { + "gpu_ecr_uri_1": "763104351884.dkr.ecr.eu-central-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" + }, + "eu-north-1": { + "gpu_ecr_uri_1": "763104351884.dkr.ecr.eu-north-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" + }, + "eu-south-1": { + "gpu_ecr_uri_1": "692866216735.dkr.ecr.eu-south-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" + }, + "eu-west-1": { + "gpu_ecr_uri_1": "763104351884.dkr.ecr.eu-west-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" + }, + "eu-west-2": { + "gpu_ecr_uri_1": "763104351884.dkr.ecr.eu-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" + }, + "eu-west-3": { + "gpu_ecr_uri_1": "763104351884.dkr.ecr.eu-west-3.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" + }, + "il-central-1": { + "gpu_ecr_uri_1": "780543022126.dkr.ecr.il-central-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" + }, + "me-central-1": { + "gpu_ecr_uri_1": "914824155844.dkr.ecr.me-central-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" + }, + "me-south-1": { + "gpu_ecr_uri_1": "217643126080.dkr.ecr.me-south-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" + }, + "sa-east-1": { + "gpu_ecr_uri_1": "763104351884.dkr.ecr.sa-east-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" + }, + "us-east-1": { + "gpu_ecr_uri_1": "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" + }, + "us-east-2": { + "gpu_ecr_uri_1": "763104351884.dkr.ecr.us-east-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" + }, + "us-gov-east-1": { + "gpu_ecr_uri_1": "446045086412.dkr.ecr.us-gov-east-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" + }, + "us-gov-west-1": { + "gpu_ecr_uri_1": "442386744353.dkr.ecr.us-gov-west-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" + }, + "us-west-1": { + "gpu_ecr_uri_1": "763104351884.dkr.ecr.us-west-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" + }, + "us-west-2": { + "gpu_ecr_uri_1": "763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" + } + }, + "variants": { + "g4dn": { + "regional_properties": { + "image_uri": "$gpu_ecr_uri_1" + }, + "properties": { + "gated_model_key_env_var_value": "meta-training/g4dn/v1.0.0/train-meta-textgeneration-llama-2-7b-f.tar.gz" + } + }, + "g5": { + "regional_properties": { + "image_uri": "$gpu_ecr_uri_1" + }, + "properties": { + "gated_model_key_env_var_value": "meta-training/g5/v1.0.0/train-meta-textgeneration-llama-2-7b-f.tar.gz" + } + }, + "g6": { + "regional_properties": { + "image_uri": "$gpu_ecr_uri_1" + } + }, + "g6e": { + "regional_properties": { + "image_uri": "$gpu_ecr_uri_1" + } + }, + "local_gpu": { + "regional_properties": { + "image_uri": "$gpu_ecr_uri_1" + } + }, + "p2": { + "regional_properties": { + "image_uri": "$gpu_ecr_uri_1" + } + }, + "p3": { + "regional_properties": { + "image_uri": "$gpu_ecr_uri_1" + } + }, + "p3dn": { + "regional_properties": { + "image_uri": "$gpu_ecr_uri_1" + }, + "properties": { + "gated_model_key_env_var_value": "meta-training/p3dn/v1.0.0/train-meta-textgeneration-llama-2-7b-f.tar.gz" + } + }, + "p4d": { + "regional_properties": { + "image_uri": "$gpu_ecr_uri_1" + } + }, + "p4de": { + "regional_properties": { + "image_uri": "$gpu_ecr_uri_1" + } + }, + "p5": { + "regional_properties": { + "image_uri": "$gpu_ecr_uri_1" + } + }, + "p5e": { + "regional_properties": { + "image_uri": "$gpu_ecr_uri_1" + } + }, + "p5en": { + "regional_properties": { + "image_uri": "$gpu_ecr_uri_1" + } + }, + "p6": { + "regional_properties": { + "image_uri": "$gpu_ecr_uri_1" + } + }, + "p6e": { + "regional_properties": { + "image_uri": "$gpu_ecr_uri_1" + } + } + } + }, + "hosting_artifact_s3_data_type": "S3Prefix", + "hosting_artifact_compression_type": "None", + "dynamic_container_deployment_supported": true, + "inference_configs": { + "tgi": { + "component_names": [ + "tgi" + ] + }, + "lmi": { + "component_names": [ + "lmi" + ], + "benchmark_metrics": { + "ml.g6.12xlarge": [ + { + "name": "latency", + "unit": "sec", + "value": "0.19", + "concurrency": "16" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "19.7", + "concurrency": "16" + }, + { + "name": "latency", + "unit": "sec", + "value": "0.22", + "concurrency": "32" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "11.6", + "concurrency": "32" + } + ], + "ml.p4d.24xlarge": [ + { + "name": "latency", + "unit": "sec", + "value": "2.58", + "concurrency": "256" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "3448.3", + "concurrency": "256" + } + ] + } + }, + "lmi-optimized": { + "component_names": [ + "lmi-optimized" + ], + "benchmark_metrics": { + "ml.g5.12xlarge": [ + { + "name": "latency", + "unit": "sec", + "value": "0.23", + "concurrency": "1" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "156.2", + "concurrency": "1" + }, + { + "name": "latency", + "unit": "sec", + "value": "0.25", + "concurrency": "2" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "93.1", + "concurrency": "2" + }, + { + "name": "latency", + "unit": "sec", + "value": "0.27", + "concurrency": "4" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "58.2", + "concurrency": "4" + }, + { + "name": "latency", + "unit": "sec", + "value": "0.29", + "concurrency": "8" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "31.0", + "concurrency": "8" + }, + { + "name": "latency", + "unit": "sec", + "value": "0.42", + "concurrency": "16" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "15.2", + "concurrency": "16" + }, + { + "name": "latency", + "unit": "sec", + "value": "0.58", + "concurrency": "32" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "8.0", + "concurrency": "32" + }, + { + "name": "latency", + "unit": "sec", + "value": "2.42", + "concurrency": "128" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "4.6", + "concurrency": "128" + } + ], + "ml.g5.2xlarge": [ + { + "name": "latency", + "unit": "sec", + "value": "0.19", + "concurrency": "1" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "66.9", + "concurrency": "1" + }, + { + "name": "latency", + "unit": "sec", + "value": "0.19", + "concurrency": "2" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "55.5", + "concurrency": "2" + }, + { + "name": "latency", + "unit": "sec", + "value": "0.22", + "concurrency": "4" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "41.8", + "concurrency": "4" + }, + { + "name": "latency", + "unit": "sec", + "value": "0.44", + "concurrency": "8" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "31.3", + "concurrency": "8" + }, + { + "name": "latency", + "unit": "sec", + "value": "2.87", + "concurrency": "16" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "71.1", + "concurrency": "16" + } + ], + "ml.g6.12xlarge": [ + { + "name": "latency", + "unit": "sec", + "value": "0.16", + "concurrency": "1" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "107.1", + "concurrency": "1" + }, + { + "name": "latency", + "unit": "sec", + "value": "0.17", + "concurrency": "2" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "79.5", + "concurrency": "2" + }, + { + "name": "latency", + "unit": "sec", + "value": "0.19", + "concurrency": "4" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "55.1", + "concurrency": "4" + }, + { + "name": "latency", + "unit": "sec", + "value": "0.21", + "concurrency": "8" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "34.4", + "concurrency": "8" + }, + { + "name": "latency", + "unit": "sec", + "value": "3.75", + "concurrency": "64" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "8.3", + "concurrency": "64" + } + ], + "ml.g6.2xlarge": [ + { + "name": "latency", + "unit": "sec", + "value": "0.23", + "concurrency": "1" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "38.2", + "concurrency": "1" + }, + { + "name": "latency", + "unit": "sec", + "value": "0.30", + "concurrency": "2" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "32.9", + "concurrency": "2" + }, + { + "name": "latency", + "unit": "sec", + "value": "0.30", + "concurrency": "4" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "24.5", + "concurrency": "4" + }, + { + "name": "latency", + "unit": "sec", + "value": "0.60", + "concurrency": "8" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "21.0", + "concurrency": "8" + }, + { + "name": "latency", + "unit": "sec", + "value": "4.19", + "concurrency": "16" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "50.0", + "concurrency": "16" + } + ], + "ml.p4d.24xlarge": [ + { + "name": "latency", + "unit": "sec", + "value": "0.06", + "concurrency": "1" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "150.2", + "concurrency": "1" + }, + { + "name": "latency", + "unit": "sec", + "value": "0.06", + "concurrency": "2" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "149.0", + "concurrency": "2" + }, + { + "name": "latency", + "unit": "sec", + "value": "0.06", + "concurrency": "4" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "149.0", + "concurrency": "4" + }, + { + "name": "latency", + "unit": "sec", + "value": "0.06", + "concurrency": "8" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "141.0", + "concurrency": "8" + }, + { + "name": "latency", + "unit": "sec", + "value": "0.06", + "concurrency": "16" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "128.9", + "concurrency": "16" + }, + { + "name": "latency", + "unit": "sec", + "value": "0.06", + "concurrency": "32" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "105.2", + "concurrency": "32" + }, + { + "name": "latency", + "unit": "sec", + "value": "0.07", + "concurrency": "64" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "73.9", + "concurrency": "64" + }, + { + "name": "latency", + "unit": "sec", + "value": "0.37", + "concurrency": "128" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "68.4", + "concurrency": "128" + }, + { + "name": "latency", + "unit": "sec", + "value": "4.58", + "concurrency": "512" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "11111.1", + "concurrency": "512" + } + ], + "ml.p5.48xlarge": [ + { + "name": "latency", + "unit": "sec", + "value": "0.04", + "concurrency": "1" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "253.2", + "concurrency": "1" + }, + { + "name": "latency", + "unit": "sec", + "value": "0.03", + "concurrency": "2" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "257.1", + "concurrency": "2" + }, + { + "name": "latency", + "unit": "sec", + "value": "0.03", + "concurrency": "4" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "252.5", + "concurrency": "4" + }, + { + "name": "latency", + "unit": "sec", + "value": "0.03", + "concurrency": "8" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "236.4", + "concurrency": "8" + }, + { + "name": "latency", + "unit": "sec", + "value": "0.04", + "concurrency": "16" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "213.2", + "concurrency": "16" + }, + { + "name": "latency", + "unit": "sec", + "value": "0.04", + "concurrency": "32" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "178.6", + "concurrency": "32" + }, + { + "name": "latency", + "unit": "sec", + "value": "0.04", + "concurrency": "64" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "129.0", + "concurrency": "64" + }, + { + "name": "latency", + "unit": "sec", + "value": "0.04", + "concurrency": "128" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "81.2", + "concurrency": "128" + }, + { + "name": "latency", + "unit": "sec", + "value": "0.33", + "concurrency": "256" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "74.5", + "concurrency": "256" + }, + { + "name": "latency", + "unit": "sec", + "value": "1.77", + "concurrency": "512" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "724.6", + "concurrency": "512" + }, + { + "name": "latency", + "unit": "sec", + "value": "2.96", + "concurrency": "768" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "6666.7", + "concurrency": "768" + }, + { + "name": "latency", + "unit": "sec", + "value": "2.22", + "concurrency": "1024" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "5882.4", + "concurrency": "1024" + }, + { + "name": "latency", + "unit": "sec", + "value": "3.88", + "concurrency": "1280" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "11111.1", + "concurrency": "1280" + }, + { + "name": "latency", + "unit": "sec", + "value": "3.99", + "concurrency": "1536" + }, + { + "name": "throughput", + "unit": "tokens/sec", + "value": "11111.1", + "concurrency": "1536" + } + ] + }, + "acceleration_configs": [ + { + "type": "Compilation", + "enabled": false + }, + { + "type": "Speculative-Decoding", + "enabled": true + }, + { + "type": "Quantization", + "enabled": false + } + ] + }, + "neuron": { + "component_names": [ + "neuron" + ] + } + }, + "inference_config_components": { + "tgi": { + "hosting_ecr_specs": { + "framework": "huggingface-llm", + "framework_version": "2.0.0", + "py_version": "py310" + }, + "hosting_script_key": "source-directory-tarballs/meta/inference/textgeneration/v1.2.3/sourcedir.tar.gz", + "hosting_use_script_uri": false, + "inference_dependencies": [], + "inference_vulnerable": false, + "inference_vulnerabilities": [], + "hosting_artifact_key": "meta-textgeneration/meta-textgeneration-llama-2-7b-f/artifacts/inference/v1.1.0/", + "hosting_prepacked_artifact_version": "1.1.0", + "hosting_prepacked_artifact_key": "meta-textgeneration/meta-textgeneration-llama-2-7b-f/artifacts/inference-prepack/v1.1.0/", + "hosting_artifact_s3_data_type": "S3Prefix", + "hosting_artifact_compression_type": "None", + "hosting_neuron_model_id": "meta-textgenerationneuron-llama-2-7b-f", + "hosting_neuron_model_version": "1.0.0", + "model_kwargs": {}, + "deploy_kwargs": { + "model_data_download_timeout": 1200, + "container_startup_health_check_timeout": 1200 + }, + "predictor_specs": { + "supported_content_types": [ + "application/json" + ], + "supported_accept_types": [ + "application/json" + ], + "default_content_type": "application/json", + "default_accept_type": "application/json" + }, + "default_inference_instance_type": "ml.g5.12xlarge", + "supported_inference_instance_types": [ + "ml.g5.12xlarge", + "ml.g5.24xlarge", + "ml.g5.2xlarge", + "ml.g5.48xlarge", + "ml.g5.4xlarge", + "ml.g5.8xlarge", + "ml.g6.12xlarge", + "ml.p4d.24xlarge" + ], + "hosting_instance_type_variants": { + "regional_aliases": { + "af-south-1": { + "gpu_ecr_uri_1": "626614931356.dkr.ecr.af-south-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "ap-east-1": { + "gpu_ecr_uri_1": "871362719292.dkr.ecr.ap-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "ap-east-2": { + "gpu_ecr_uri_1": "975050140332.dkr.ecr.ap-east-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "ap-northeast-1": { + "gpu_ecr_uri_1": "763104351884.dkr.ecr.ap-northeast-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "ap-northeast-2": { + "gpu_ecr_uri_1": "763104351884.dkr.ecr.ap-northeast-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "ap-northeast-3": { + "gpu_ecr_uri_1": "364406365360.dkr.ecr.ap-northeast-3.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "ap-south-1": { + "gpu_ecr_uri_1": "763104351884.dkr.ecr.ap-south-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "ap-south-2": { + "gpu_ecr_uri_1": "772153158452.dkr.ecr.ap-south-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "ap-southeast-1": { + "gpu_ecr_uri_1": "763104351884.dkr.ecr.ap-southeast-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "ap-southeast-2": { + "gpu_ecr_uri_1": "763104351884.dkr.ecr.ap-southeast-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "ap-southeast-3": { + "gpu_ecr_uri_1": "907027046896.dkr.ecr.ap-southeast-3.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "ap-southeast-4": { + "gpu_ecr_uri_1": "457447274322.dkr.ecr.ap-southeast-4.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "ap-southeast-5": { + "gpu_ecr_uri_1": "550225433462.dkr.ecr.ap-southeast-5.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "ap-southeast-7": { + "gpu_ecr_uri_1": "590183813437.dkr.ecr.ap-southeast-7.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "ca-central-1": { + "gpu_ecr_uri_1": "763104351884.dkr.ecr.ca-central-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "ca-west-1": { + "gpu_ecr_uri_1": "204538143572.dkr.ecr.ca-west-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "cn-north-1": { + "gpu_ecr_uri_1": "727897471807.dkr.ecr.cn-north-1.amazonaws.com.cn/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "cn-northwest-1": { + "gpu_ecr_uri_1": "727897471807.dkr.ecr.cn-northwest-1.amazonaws.com.cn/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "eu-central-1": { + "gpu_ecr_uri_1": "763104351884.dkr.ecr.eu-central-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "eu-central-2": { + "gpu_ecr_uri_1": "380420809688.dkr.ecr.eu-central-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "eu-north-1": { + "gpu_ecr_uri_1": "763104351884.dkr.ecr.eu-north-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "eu-south-1": { + "gpu_ecr_uri_1": "692866216735.dkr.ecr.eu-south-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "eu-south-2": { + "gpu_ecr_uri_1": "503227376785.dkr.ecr.eu-south-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "eu-west-1": { + "gpu_ecr_uri_1": "763104351884.dkr.ecr.eu-west-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "eu-west-2": { + "gpu_ecr_uri_1": "763104351884.dkr.ecr.eu-west-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "eu-west-3": { + "gpu_ecr_uri_1": "763104351884.dkr.ecr.eu-west-3.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "il-central-1": { + "gpu_ecr_uri_1": "780543022126.dkr.ecr.il-central-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "me-central-1": { + "gpu_ecr_uri_1": "914824155844.dkr.ecr.me-central-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "me-south-1": { + "gpu_ecr_uri_1": "217643126080.dkr.ecr.me-south-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "mx-central-1": { + "gpu_ecr_uri_1": "637423239942.dkr.ecr.mx-central-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "sa-east-1": { + "gpu_ecr_uri_1": "763104351884.dkr.ecr.sa-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "us-east-1": { + "gpu_ecr_uri_1": "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "us-east-2": { + "gpu_ecr_uri_1": "763104351884.dkr.ecr.us-east-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "us-gov-east-1": { + "gpu_ecr_uri_1": "446045086412.dkr.ecr.us-gov-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "us-gov-west-1": { + "gpu_ecr_uri_1": "442386744353.dkr.ecr.us-gov-west-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "us-west-1": { + "gpu_ecr_uri_1": "763104351884.dkr.ecr.us-west-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + }, + "us-west-2": { + "gpu_ecr_uri_1": "763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" + } + }, + "variants": { + "g4dn": { + "regional_properties": { + "image_uri": "$gpu_ecr_uri_1" + } + }, + "g5": { + "regional_properties": { + "image_uri": "$gpu_ecr_uri_1" + } + }, + "g6": { + "regional_properties": { + "image_uri": "$gpu_ecr_uri_1" + } + }, + "g6e": { + "regional_properties": { + "image_uri": "$gpu_ecr_uri_1" + } + }, + "local_gpu": { + "regional_properties": { + "image_uri": "$gpu_ecr_uri_1" + } + }, + "p2": { + "regional_properties": { + "image_uri": "$gpu_ecr_uri_1" + } + }, + "p3": { + "regional_properties": { + "image_uri": "$gpu_ecr_uri_1" + } + }, + "p3dn": { + "regional_properties": { + "image_uri": "$gpu_ecr_uri_1" + } + }, + "p4d": { + "regional_properties": { + "image_uri": "$gpu_ecr_uri_1" + } + }, + "p4de": { + "regional_properties": { + "image_uri": "$gpu_ecr_uri_1" + } + }, + "p5": { + "regional_properties": { + "image_uri": "$gpu_ecr_uri_1" + } + }, + "p5e": { + "regional_properties": { + "image_uri": "$gpu_ecr_uri_1" + } + }, + "p5en": { + "regional_properties": { + "image_uri": "$gpu_ecr_uri_1" + } + }, + "p6": { + "regional_properties": { + "image_uri": "$gpu_ecr_uri_1" + } + }, + "p6e": { + "regional_properties": { + "image_uri": "$gpu_ecr_uri_1" + } + }, + "ml.g5.12xlarge": { + "properties": { + "environment_variables": { + "SM_NUM_GPUS": "4", + "MAX_BATCH_PREFILL_TOKENS": "16384" + }, + "resource_requirements": { + "min_memory_mb": 98304, + "num_accelerators": 4 + } + } + }, + "ml.g5.24xlarge": { + "properties": { + "environment_variables": { + "SM_NUM_GPUS": "4" + }, + "resource_requirements": { + "min_memory_mb": 196608, + "num_accelerators": 4 + } + } + }, + "ml.g5.48xlarge": { + "properties": { + "environment_variables": { + "SM_NUM_GPUS": "8" + }, + "resource_requirements": { + "min_memory_mb": 393216, + "num_accelerators": 8 + } + } + }, + "ml.p4d.24xlarge": { + "properties": { + "environment_variables": { + "SM_NUM_GPUS": "8", + "MAX_BATCH_PREFILL_TOKENS": "16384" + }, + "resource_requirements": { + "min_memory_mb": 589824, + "num_accelerators": 8 + } + } + }, + "ml.p5.48xlarge": { + "properties": { + "environment_variables": { + "OPTION_GPU_MEMORY_UTILIZATION": "0.95" + } + } + }, + "ml.g5.2xlarge": { + "properties": { + "resource_requirements": { + "min_memory_mb": 16384, + "num_accelerators": 1 + } + } + }, + "ml.g5.4xlarge": { + "properties": { + "resource_requirements": { + "min_memory_mb": 32768, + "num_accelerators": 1 + } + } + }, + "ml.g5.8xlarge": { + "properties": { + "resource_requirements": { + "min_memory_mb": 65536, + "num_accelerators": 1 + } + } + } + } + }, + "inference_volume_size": 256, + "inference_enable_network_isolation": true, + "hosting_resource_requirements": { + "min_memory_mb": 98304, + "num_accelerators": 4 + }, + "inference_environment_variables": [ + { + "name": "SAGEMAKER_PROGRAM", + "type": "text", + "default": "inference.py", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "SAGEMAKER_SUBMIT_DIRECTORY", + "type": "text", + "default": "/opt/ml/model/code", + "scope": "container", + "required_for_model_class": false + }, + { + "name": "SAGEMAKER_CONTAINER_LOG_LEVEL", + "type": "text", + "default": "20", + "scope": "container", + "required_for_model_class": false + }, + { + "name": "SAGEMAKER_MODEL_SERVER_TIMEOUT", + "type": "text", + "default": "3600", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "ENDPOINT_SERVER_TIMEOUT", + "type": "int", + "default": 3600, + "scope": "container", + "required_for_model_class": true + }, + { + "name": "MODEL_CACHE_ROOT", + "type": "text", + "default": "/opt/ml/model", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "SAGEMAKER_ENV", + "type": "text", + "default": "1", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "HF_MODEL_ID", + "type": "text", + "default": "/opt/ml/model", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "OPTION_GPU_MEMORY_UTILIZATION", + "type": "text", + "default": "0.85", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "SM_NUM_GPUS", + "type": "text", + "default": "1", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "MAX_INPUT_LENGTH", + "type": "text", + "default": "4095", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "MAX_TOTAL_TOKENS", + "type": "text", + "default": "4096", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "MAX_BATCH_PREFILL_TOKENS", + "type": "text", + "default": "8192", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "MAX_CONCURRENT_REQUESTS", + "type": "text", + "default": "512", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "SAGEMAKER_MODEL_SERVER_WORKERS", + "type": "int", + "default": 1, + "scope": "container", + "required_for_model_class": true + } + ], + "default_payloads": { + "pingExponentialBackoff": { + "content_type": "application/json", + "prompt_key": "inputs", + "output_keys": { + "generated_text": "[0].generated_text", + "input_logprobs": "[0].details.prefill[*].logprob" + }, + "body": { + "inputs": "import socket\n\ndef ping_exponential_backoff(host: str):", + "parameters": { + "max_new_tokens": 256, + "top_p": 0.9, + "temperature": 0.2, + "decoder_input_details": true, + "details": true + } + } + }, + "argparse": { + "content_type": "application/json", + "prompt_key": "inputs", + "output_keys": { + "generated_text": "[0].generated_text" + }, + "body": { + "inputs": "import argparse\n\ndef main(string: str):\n print(string)\n print(string[::-1])\n\nif __name__ == \"__main__\":", + "parameters": { + "max_new_tokens": 256, + "top_p": 0.9, + "temperature": 0.05 + } + } + }, + "Fibonacci": { + "content_type": "application/json", + "prompt_key": "inputs", + "output_keys": { + "generated_text": "[0].generated_text", + "input_logprobs": "[0].details.prefill[*].logprob" + }, + "body": { + "inputs": "def fib(n):\n", + "parameters": { + "max_new_tokens": 64, + "top_p": 0.9, + "temperature": 0.2, + "decoder_input_details": true, + "details": true + } + } + }, + "removeNonAscii": { + "content_type": "application/json", + "prompt_key": "inputs", + "output_keys": { + "generated_text": "[0].generated_text", + "input_logprobs": "[0].details.prefill[*].logprob" + }, + "body": { + "inputs": "def remove_non_ascii(s: str) -> str:\n \"\"\"\n return result\n", + "parameters": { + "max_new_tokens": 256, + "top_p": 0.9, + "temperature": 0.05, + "decoder_input_details": true, + "details": true + } + } + }, + "installationInstructions": { + "content_type": "application/json", + "prompt_key": "inputs", + "output_keys": { + "generated_text": "[0].generated_text" + }, + "body": { + "inputs": "# Installation instructions:\n ```bash\n\n ```\nThis downloads the LLaMA inference code and installs the repository as a local pip package.\n", + "parameters": { + "max_new_tokens": 256, + "top_p": 0.9, + "temperature": 0.05 + } + } + }, + "interfaceManager": { + "content_type": "application/json", + "prompt_key": "inputs", + "output_keys": { + "generated_text": "[0].generated_text" + }, + "body": { + "inputs": "class InterfaceManagerFactory(AbstractManagerFactory):\n def __init__(\ndef main():\n factory = InterfaceManagerFactory(start=datetime.now())\n managers = []\n for i in range(10):\n managers.append(factory.build(id=i))\n", + "parameters": { + "max_new_tokens": 256, + "top_p": 0.9, + "temperature": 0.05 + } + } + }, + "quasiPrefunctoid": { + "content_type": "application/json", + "prompt_key": "inputs", + "output_keys": { + "generated_text": "[0].generated_text" + }, + "body": { + "inputs": "/-- A quasi-prefunctoid is 1-connected iff all its etalisations are 1-connected. -/\ntheorem connected_iff_etalisation [C D : precategoroid] (P : quasi_prefunctoid C D) :\n π₁ P = 0 ↔ = 0 :=\nbegin\n split,\n { intros h f,\n rw pi_1_etalisation at h,\n simp [h],\n refl\n },\n { intro h,\n have := @quasi_adjoint C D P,\n simp [←pi_1_etalisation, this, h],\n refl\n }\nend\n", + "parameters": { + "max_new_tokens": 256, + "top_p": 0.9, + "temperature": 0.05 + } + } + }, + "bashListTextFiles": { + "content_type": "application/json", + "prompt_key": "inputs", + "output_keys": { + "generated_text": "[0].generated_text", + "input_logprobs": "[0].details.prefill[*].logprob" + }, + "body": { + "inputs": "[INST] In Bash, how do I list all text files in the current directory (excluding subdirectories) that have been modified in the last month? [/INST] ", + "parameters": { + "max_new_tokens": 256, + "top_p": 0.9, + "temperature": 0.05, + "decoder_input_details": true, + "details": true + } + } + }, + "inorderPreorderTraversal": { + "content_type": "application/json", + "prompt_key": "inputs", + "output_keys": { + "generated_text": "[0].generated_text" + }, + "body": { + "inputs": "[INST] What is the difference between inorder and preorder traversal? Give an example in Python. [/INST] ", + "parameters": { + "max_new_tokens": 256, + "top_p": 0.9, + "temperature": 0.05 + } + } + }, + "contiguousSublists": { + "content_type": "application/json", + "prompt_key": "inputs", + "output_keys": { + "generated_text": "[0].generated_text" + }, + "body": { + "inputs": "[INST] <>\nProvide answers in JavaScript\n<>\n\nWrite a function that computes the set of sums of all contiguous sublists of a given list. [/INST] ", + "parameters": { + "max_new_tokens": 256, + "top_p": 0.9, + "temperature": 0.05 + } + } + } + } + }, + "lmi": { + "hosting_ecr_specs": { + "framework": "djl-deepspeed", + "framework_version": "0.27.0", + "py_version": "py310" + }, + "hosting_script_key": "source-directory-tarballs/meta/inference/textgeneration/v1.2.3/sourcedir.tar.gz", + "hosting_use_script_uri": false, + "inference_dependencies": [], + "inference_vulnerable": false, + "inference_vulnerabilities": [], + "hosting_artifact_key": "meta-textgeneration/meta-textgeneration-llama-2-7b-f/artifacts/inference/v1.1.0/", + "hosting_prepacked_artifact_version": "1.1.0", + "hosting_prepacked_artifact_key": "meta-textgeneration/meta-textgeneration-llama-2-7b-f/artifacts/inference-prepack/v1.1.0/", + "hosting_artifact_s3_data_type": "S3Prefix", + "hosting_artifact_compression_type": "None", + "hosting_neuron_model_id": "meta-textgenerationneuron-llama-2-7b-f", + "hosting_neuron_model_version": "1.0.0", + "model_kwargs": {}, + "deploy_kwargs": { + "model_data_download_timeout": 1200, + "container_startup_health_check_timeout": 1200 + }, + "predictor_specs": { + "supported_content_types": [ + "application/json" + ], + "supported_accept_types": [ + "application/json" + ], + "default_content_type": "application/json", + "default_accept_type": "application/json" + }, + "default_inference_instance_type": "ml.g5.12xlarge", + "supported_inference_instance_types": [ + "ml.g5.12xlarge", + "ml.g5.24xlarge", + "ml.g5.2xlarge", + "ml.g5.48xlarge", + "ml.g5.4xlarge", + "ml.g5.8xlarge", + "ml.g6.12xlarge", + "ml.p4d.24xlarge" + ], + "hosting_instance_type_variants": { + "regional_aliases": { + "af-south-1": { + "alias_ecr_uri_1": "626614931356.dkr.ecr.af-south-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" + }, + "ap-east-1": { + "alias_ecr_uri_1": "871362719292.dkr.ecr.ap-east-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" + }, + "ap-northeast-1": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.ap-northeast-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" + }, + "ap-northeast-2": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.ap-northeast-2.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" + }, + "ap-northeast-3": { + "alias_ecr_uri_1": "364406365360.dkr.ecr.ap-northeast-3.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" + }, + "ap-south-1": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.ap-south-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" + }, + "ap-southeast-1": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.ap-southeast-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" + }, + "ap-southeast-2": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.ap-southeast-2.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" + }, + "ap-southeast-3": { + "alias_ecr_uri_1": "907027046896.dkr.ecr.ap-southeast-3.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" + }, + "ca-central-1": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.ca-central-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" + }, + "ca-west-1": { + "alias_ecr_uri_1": "204538143572.dkr.ecr.ca-west-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" + }, + "cn-north-1": { + "alias_ecr_uri_1": "727897471807.dkr.ecr.cn-north-1.amazonaws.com.cn/djl-inference:0.27.0-deepspeed0.12.6-cu121" + }, + "cn-northwest-1": { + "alias_ecr_uri_1": "727897471807.dkr.ecr.cn-northwest-1.amazonaws.com.cn/djl-inference:0.27.0-deepspeed0.12.6-cu121" + }, + "eu-central-1": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.eu-central-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" + }, + "eu-north-1": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.eu-north-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" + }, + "eu-south-1": { + "alias_ecr_uri_1": "692866216735.dkr.ecr.eu-south-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" + }, + "eu-west-1": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.eu-west-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" + }, + "eu-west-2": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.eu-west-2.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" + }, + "eu-west-3": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.eu-west-3.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" + }, + "il-central-1": { + "alias_ecr_uri_1": "780543022126.dkr.ecr.il-central-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" + }, + "me-south-1": { + "alias_ecr_uri_1": "217643126080.dkr.ecr.me-south-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" + }, + "sa-east-1": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.sa-east-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" + }, + "us-east-1": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" + }, + "us-east-2": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.us-east-2.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" + }, + "us-gov-east-1": { + "alias_ecr_uri_1": "446045086412.dkr.ecr.us-gov-east-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" + }, + "us-gov-west-1": { + "alias_ecr_uri_1": "442386744353.dkr.ecr.us-gov-west-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" + }, + "us-west-1": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.us-west-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" + }, + "us-west-2": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" + } + }, + "variants": { + "g4dn": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "g5": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "g6": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "g6e": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "local_gpu": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "p2": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "p3": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "p3dn": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "p4d": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "p4de": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "p5": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "p5e": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "p5en": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "p6": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "p6e": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "ml.p4d.24xlarge": { + "properties": { + "environment_variables": { + "OPTION_TENSOR_PARALLEL_DEGREE": "1" + }, + "resource_requirements": { + "min_memory_mb": 589824, + "num_accelerators": 8 + } + } + }, + "ml.p5.48xlarge": { + "properties": { + "environment_variables": { + "OPTION_TENSOR_PARALLEL_DEGREE": "1", + "OPTION_GPU_MEMORY_UTILIZATION": "0.95" + } + } + }, + "ml.g5.2xlarge": { + "properties": { + "resource_requirements": { + "min_memory_mb": 16384, + "num_accelerators": 1 + } + } + }, + "ml.g5.4xlarge": { + "properties": { + "resource_requirements": { + "min_memory_mb": 32768, + "num_accelerators": 1 + } + } + }, + "ml.g5.8xlarge": { + "properties": { + "resource_requirements": { + "min_memory_mb": 65536, + "num_accelerators": 1 + } + } + }, + "ml.g5.12xlarge": { + "properties": { + "resource_requirements": { + "min_memory_mb": 98304, + "num_accelerators": 4 + } + } + }, + "ml.g5.24xlarge": { + "properties": { + "resource_requirements": { + "min_memory_mb": 196608, + "num_accelerators": 4 + } + } + }, + "ml.g5.48xlarge": { + "properties": { + "resource_requirements": { + "min_memory_mb": 393216, + "num_accelerators": 8 + } + } + } + } + }, + "inference_volume_size": 256, + "inference_enable_network_isolation": true, + "hosting_resource_requirements": { + "min_memory_mb": 98304, + "num_accelerators": 4 + }, + "inference_environment_variables": [ + { + "name": "SAGEMAKER_PROGRAM", + "type": "text", + "default": "inference.py", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "SAGEMAKER_SUBMIT_DIRECTORY", + "type": "text", + "default": "/opt/ml/model/code", + "scope": "container", + "required_for_model_class": false + }, + { + "name": "SAGEMAKER_CONTAINER_LOG_LEVEL", + "type": "text", + "default": "20", + "scope": "container", + "required_for_model_class": false + }, + { + "name": "SAGEMAKER_MODEL_SERVER_TIMEOUT", + "type": "text", + "default": "3600", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "ENDPOINT_SERVER_TIMEOUT", + "type": "int", + "default": 3600, + "scope": "container", + "required_for_model_class": true + }, + { + "name": "MODEL_CACHE_ROOT", + "type": "text", + "default": "/opt/ml/model", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "SAGEMAKER_ENV", + "type": "text", + "default": "1", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "HF_MODEL_ID", + "type": "text", + "default": "/opt/ml/model", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "OPTION_GPU_MEMORY_UTILIZATION", + "type": "text", + "default": "0.85", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "SAGEMAKER_MODEL_SERVER_WORKERS", + "type": "int", + "default": 1, + "scope": "container", + "required_for_model_class": true + } + ], + "default_payloads": { + "meaningOfLife": { + "content_type": "application/json", + "prompt_key": "inputs", + "output_keys": { + "generated_text": "generated_text" + }, + "body": { + "inputs": "I believe the meaning of life is", + "parameters": { + "max_new_tokens": 64, + "top_p": 0.9, + "temperature": 0.6, + "decoder_input_details": true, + "details": true + } + } + }, + "theoryOfRelativity": { + "content_type": "application/json", + "prompt_key": "inputs", + "output_keys": { + "generated_text": "generated_text" + }, + "body": { + "inputs": "Simply put, the theory of relativity states that ", + "parameters": { + "max_new_tokens": 64, + "top_p": 0.9, + "temperature": 0.6 + } + } + }, + "teamMessage": { + "content_type": "application/json", + "prompt_key": "inputs", + "output_keys": { + "generated_text": "generated_text" + }, + "body": { + "inputs": "A brief message congratulating the team on the launch:\n\nHi everyone,\n\nI just ", + "parameters": { + "max_new_tokens": 64, + "top_p": 0.9, + "temperature": 0.6 + } + } + }, + "englishToFrench": { + "content_type": "application/json", + "prompt_key": "inputs", + "output_keys": { + "generated_text": "generated_text" + }, + "body": { + "inputs": "Translate English to French:\nsea otter => loutre de mer\npeppermint => menthe poivrée\nplush girafe => girafe peluche\ncheese =>", + "parameters": { + "max_new_tokens": 64, + "top_p": 0.9, + "temperature": 0.6 + } + } + } + } + }, + "lmi-optimized": { + "hosting_ecr_specs": { + "framework": "djl-lmi", + "framework_version": "0.28.0", + "py_version": "py310" + }, + "hosting_script_key": "source-directory-tarballs/meta/inference/textgeneration/v1.2.3/sourcedir.tar.gz", + "hosting_use_script_uri": false, + "inference_dependencies": [], + "inference_vulnerable": false, + "inference_vulnerabilities": [], + "hosting_artifact_key": "meta-textgeneration/meta-textgeneration-llama-2-7b-f/artifacts/inference/v1.1.0/", + "hosting_prepacked_artifact_version": "1.1.0", + "hosting_prepacked_artifact_key": "meta-textgeneration/meta-textgeneration-llama-2-7b-f/artifacts/inference-prepack/v1.1.0/", + "hosting_artifact_s3_data_type": "S3Prefix", + "hosting_artifact_compression_type": "None", + "hosting_additional_data_sources": { + "speculative_decoding": [ + { + "channel_name": "draft_model", + "artifact_version": "v2", + "s3_data_source": { + "compression_type": "None", + "s3_data_type": "S3Prefix", + "s3_uri": "sagemaker-speculative-decoding-llama2-tiny-v2/" + } + } + ] + }, + "hosting_neuron_model_id": "meta-textgenerationneuron-llama-2-7b-f", + "hosting_neuron_model_version": "1.0.0", + "model_kwargs": {}, + "deploy_kwargs": { + "model_data_download_timeout": 1200, + "container_startup_health_check_timeout": 1200 + }, + "predictor_specs": { + "supported_content_types": [ + "application/json" + ], + "supported_accept_types": [ + "application/json" + ], + "default_content_type": "application/json", + "default_accept_type": "application/json" + }, + "default_inference_instance_type": "ml.p4d.24xlarge", + "supported_inference_instance_types": [ + "ml.g5.12xlarge", + "ml.g5.2xlarge", + "ml.g6.12xlarge", + "ml.g6.2xlarge", + "ml.p4d.24xlarge", + "ml.p4de.24xlarge", + "ml.p5.48xlarge" + ], + "hosting_instance_type_variants": { + "regional_aliases": { + "af-south-1": { + "alias_ecr_uri_1": "626614931356.dkr.ecr.af-south-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" + }, + "ap-east-1": { + "alias_ecr_uri_1": "871362719292.dkr.ecr.ap-east-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" + }, + "ap-northeast-1": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.ap-northeast-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" + }, + "ap-northeast-2": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.ap-northeast-2.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" + }, + "ap-northeast-3": { + "alias_ecr_uri_1": "364406365360.dkr.ecr.ap-northeast-3.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" + }, + "ap-south-1": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.ap-south-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" + }, + "ap-southeast-1": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.ap-southeast-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" + }, + "ap-southeast-2": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.ap-southeast-2.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" + }, + "ap-southeast-3": { + "alias_ecr_uri_1": "907027046896.dkr.ecr.ap-southeast-3.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" + }, + "ca-central-1": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.ca-central-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" + }, + "ca-west-1": { + "alias_ecr_uri_1": "204538143572.dkr.ecr.ca-west-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" + }, + "cn-north-1": { + "alias_ecr_uri_1": "727897471807.dkr.ecr.cn-north-1.amazonaws.com.cn/djl-inference:0.28.0-lmi10.0.0-cu124" + }, + "cn-northwest-1": { + "alias_ecr_uri_1": "727897471807.dkr.ecr.cn-northwest-1.amazonaws.com.cn/djl-inference:0.28.0-lmi10.0.0-cu124" + }, + "eu-central-1": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.eu-central-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" + }, + "eu-north-1": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.eu-north-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" + }, + "eu-south-1": { + "alias_ecr_uri_1": "692866216735.dkr.ecr.eu-south-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" + }, + "eu-west-1": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.eu-west-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" + }, + "eu-west-2": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.eu-west-2.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" + }, + "eu-west-3": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.eu-west-3.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" + }, + "il-central-1": { + "alias_ecr_uri_1": "780543022126.dkr.ecr.il-central-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" + }, + "me-central-1": { + "alias_ecr_uri_1": "914824155844.dkr.ecr.me-central-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" + }, + "me-south-1": { + "alias_ecr_uri_1": "217643126080.dkr.ecr.me-south-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" + }, + "sa-east-1": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.sa-east-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" + }, + "us-east-1": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" + }, + "us-east-2": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.us-east-2.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" + }, + "us-gov-east-1": { + "alias_ecr_uri_1": "446045086412.dkr.ecr.us-gov-east-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" + }, + "us-gov-west-1": { + "alias_ecr_uri_1": "442386744353.dkr.ecr.us-gov-west-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" + }, + "us-west-1": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.us-west-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" + }, + "us-west-2": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" + } + }, + "variants": { + "g4dn": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "g5": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "g6": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "g6e": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "local_gpu": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "p2": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "p3": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "p3dn": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "p4d": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "p4de": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "p5": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "p5e": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "p5en": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "p6": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "p6e": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "ml.p4d.24xlarge": { + "properties": { + "environment_variables": { + "OPTION_TENSOR_PARALLEL_DEGREE": "1" + }, + "resource_requirements": { + "min_memory_mb": 589824, + "num_accelerators": 8 + } + } + }, + "ml.p5.48xlarge": { + "properties": { + "environment_variables": { + "OPTION_TENSOR_PARALLEL_DEGREE": "1", + "OPTION_GPU_MEMORY_UTILIZATION": "0.95" + } + } + }, + "ml.p4de.24xlarge": { + "properties": { + "resource_requirements": { + "min_memory_mb": 589824, + "num_accelerators": 8 + } + } + } + } + }, + "inference_volume_size": 256, + "inference_enable_network_isolation": true, + "hosting_resource_requirements": { + "min_memory_mb": 589824, + "num_accelerators": 8 + }, + "inference_environment_variables": [ + { + "name": "SAGEMAKER_PROGRAM", + "type": "text", + "default": "inference.py", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "SAGEMAKER_SUBMIT_DIRECTORY", + "type": "text", + "default": "/opt/ml/model/code", + "scope": "container", + "required_for_model_class": false + }, + { + "name": "SAGEMAKER_CONTAINER_LOG_LEVEL", + "type": "text", + "default": "20", + "scope": "container", + "required_for_model_class": false + }, + { + "name": "SAGEMAKER_MODEL_SERVER_TIMEOUT", + "type": "text", + "default": "3600", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "ENDPOINT_SERVER_TIMEOUT", + "type": "int", + "default": 3600, + "scope": "container", + "required_for_model_class": true + }, + { + "name": "MODEL_CACHE_ROOT", + "type": "text", + "default": "/opt/ml/model", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "SAGEMAKER_ENV", + "type": "text", + "default": "1", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "HF_MODEL_ID", + "type": "text", + "default": "/opt/ml/model", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "OPTION_SPECULATIVE_DRAFT_MODEL", + "type": "text", + "default": "/opt/ml/additional-model-data-sources/draft_model", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "OPTION_GPU_MEMORY_UTILIZATION", + "type": "text", + "default": "0.85", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "SAGEMAKER_MODEL_SERVER_WORKERS", + "type": "int", + "default": 1, + "scope": "container", + "required_for_model_class": true + } + ], + "default_payloads": { + "meaningOfLife": { + "content_type": "application/json", + "prompt_key": "inputs", + "output_keys": { + "generated_text": "generated_text" + }, + "body": { + "inputs": "I believe the meaning of life is", + "parameters": { + "max_new_tokens": 64, + "top_p": 0.9, + "temperature": 0.6 + } + } + }, + "theoryOfRelativity": { + "content_type": "application/json", + "prompt_key": "inputs", + "output_keys": { + "generated_text": "generated_text" + }, + "body": { + "inputs": "Simply put, the theory of relativity states that ", + "parameters": { + "max_new_tokens": 64, + "top_p": 0.9, + "temperature": 0.6 + } + } + }, + "teamMessage": { + "content_type": "application/json", + "prompt_key": "inputs", + "output_keys": { + "generated_text": "generated_text" + }, + "body": { + "inputs": "A brief message congratulating the team on the launch:\n\nHi everyone,\n\nI just ", + "parameters": { + "max_new_tokens": 64, + "top_p": 0.9, + "temperature": 0.6 + } + } + }, + "englishToFrench": { + "content_type": "application/json", + "prompt_key": "inputs", + "output_keys": { + "generated_text": "generated_text" + }, + "body": { + "inputs": "Translate English to French:\nsea otter => loutre de mer\npeppermint => menthe poivrée\nplush girafe => girafe peluche\ncheese =>", + "parameters": { + "max_new_tokens": 64, + "top_p": 0.9, + "temperature": 0.6 + } + } + } + } + }, + "neuron": { + "hosting_ecr_specs": { + "framework": "djl-neuronx", + "framework_version": "0.24.0", + "py_version": "py39" + }, + "hosting_script_key": "source-directory-tarballs/meta/inference/textgenerationneuron/v1.0.0/sourcedir.tar.gz", + "hosting_use_script_uri": false, + "inference_dependencies": [ + "sagemaker_jumpstart_huggingface_script_utilities==1.0.8", + "sagemaker_jumpstart_script_utilities==1.1.8" + ], + "inference_vulnerable": false, + "inference_vulnerabilities": [], + "hosting_artifact_key": "meta-textgeneration/meta-textgeneration-llama-2-7b-f/artifacts/neuron/inference/v1.0.0/", + "hosting_prepacked_artifact_version": "1.0.0", + "hosting_prepacked_artifact_key": "meta-textgeneration/meta-textgeneration-llama-2-7b-f/artifacts/neuron/inference-prepack/v1.0.0/", + "hosting_artifact_s3_data_type": "S3Prefix", + "hosting_artifact_compression_type": "None", + "hosting_neuron_model_id": "meta-textgeneration-llama-2-7b-f", + "hosting_neuron_model_version": "1.0.0", + "model_kwargs": {}, + "deploy_kwargs": { + "model_data_download_timeout": 3600, + "container_startup_health_check_timeout": 3600 + }, + "predictor_specs": { + "supported_content_types": [ + "application/json" + ], + "supported_accept_types": [ + "application/json" + ], + "default_content_type": "application/json", + "default_accept_type": "application/json" + }, + "default_inference_instance_type": "ml.inf2.xlarge", + "supported_inference_instance_types": [ + "ml.inf2.xlarge", + "ml.inf2.8xlarge", + "ml.inf2.24xlarge", + "ml.inf2.48xlarge" + ], + "hosting_instance_type_variants": { + "regional_aliases": { + "ap-northeast-1": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.ap-northeast-1.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1" + }, + "ap-south-1": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.ap-south-1.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1" + }, + "ap-southeast-1": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.ap-southeast-1.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1" + }, + "ap-southeast-2": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.ap-southeast-2.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1" + }, + "ap-southeast-5": { + "alias_ecr_uri_1": "550225433462.dkr.ecr.ap-southeast-5.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1" + }, + "ap-southeast-7": { + "alias_ecr_uri_1": "590183813437.dkr.ecr.ap-southeast-7.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1" + }, + "ca-west-1": { + "alias_ecr_uri_1": "204538143572.dkr.ecr.ca-west-1.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1" + }, + "eu-central-1": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.eu-central-1.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1" + }, + "eu-central-2": { + "alias_ecr_uri_1": "380420809688.dkr.ecr.eu-central-2.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1" + }, + "eu-west-1": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.eu-west-1.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1" + }, + "eu-west-3": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.eu-west-3.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1" + }, + "mx-central-1": { + "alias_ecr_uri_1": "637423239942.dkr.ecr.mx-central-1.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1" + }, + "sa-east-1": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.sa-east-1.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1" + }, + "us-east-1": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1" + }, + "us-east-2": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.us-east-2.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1" + }, + "us-west-2": { + "alias_ecr_uri_1": "763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1" + } + }, + "variants": { + "inf2": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "trn1": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "trn1n": { + "regional_properties": { + "image_uri": "$alias_ecr_uri_1" + } + }, + "ml.inf2.xlarge": { + "properties": { + "environment_variables": { + "OPTION_TENSOR_PARALLEL_DEGREE": "2", + "OPTION_N_POSITIONS": "1024", + "OPTION_DTYPE": "fp16", + "OPTION_ROLLING_BATCH": "auto", + "OPTION_MAX_ROLLING_BATCH_SIZE": "1", + "OPTION_NEURON_OPTIMIZE_LEVEL": "2" + }, + "resource_requirements": { + "min_memory_mb": 8192, + "num_accelerators": 1 + } + } + }, + "ml.inf2.8xlarge": { + "properties": { + "environment_variables": { + "OPTION_TENSOR_PARALLEL_DEGREE": "2", + "OPTION_N_POSITIONS": "2048", + "OPTION_DTYPE": "fp16", + "OPTION_ROLLING_BATCH": "auto", + "OPTION_MAX_ROLLING_BATCH_SIZE": "4", + "OPTION_NEURON_OPTIMIZE_LEVEL": "2" + }, + "resource_requirements": { + "min_memory_mb": 65536, + "num_accelerators": 1 + } + } + }, + "ml.inf2.24xlarge": { + "properties": { + "environment_variables": { + "OPTION_TENSOR_PARALLEL_DEGREE": "12", + "OPTION_N_POSITIONS": "4096", + "OPTION_DTYPE": "fp16", + "OPTION_ROLLING_BATCH": "auto", + "OPTION_MAX_ROLLING_BATCH_SIZE": "4", + "OPTION_NEURON_OPTIMIZE_LEVEL": "2" + }, + "resource_requirements": { + "min_memory_mb": 196608, + "num_accelerators": 6 + } + } + }, + "ml.inf2.48xlarge": { + "properties": { + "environment_variables": { + "OPTION_TENSOR_PARALLEL_DEGREE": "24", + "OPTION_N_POSITIONS": "4096", + "OPTION_DTYPE": "fp16", + "OPTION_ROLLING_BATCH": "auto", + "OPTION_MAX_ROLLING_BATCH_SIZE": "4", + "OPTION_NEURON_OPTIMIZE_LEVEL": "2" + }, + "resource_requirements": { + "min_memory_mb": 393216, + "num_accelerators": 12 + } + } + } + } + }, + "inference_volume_size": 256, + "inference_enable_network_isolation": false, + "hosting_resource_requirements": { + "min_memory_mb": 8192, + "num_accelerators": 1 + }, + "inference_environment_variables": [ + { + "name": "SAGEMAKER_PROGRAM", + "type": "text", + "default": "inference.py", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "SAGEMAKER_SUBMIT_DIRECTORY", + "type": "text", + "default": "/opt/ml/model/code", + "scope": "container", + "required_for_model_class": false + }, + { + "name": "SAGEMAKER_CONTAINER_LOG_LEVEL", + "type": "text", + "default": "20", + "scope": "container", + "required_for_model_class": false + }, + { + "name": "SAGEMAKER_MODEL_SERVER_TIMEOUT", + "type": "text", + "default": "3600", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "ENDPOINT_SERVER_TIMEOUT", + "type": "int", + "default": 3600, + "scope": "container", + "required_for_model_class": true + }, + { + "name": "MODEL_CACHE_ROOT", + "type": "text", + "default": "/opt/ml/model", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "SAGEMAKER_ENV", + "type": "text", + "default": "1", + "scope": "container", + "required_for_model_class": true + }, + { + "name": "SAGEMAKER_MODEL_SERVER_WORKERS", + "type": "int", + "default": 1, + "scope": "container", + "required_for_model_class": true + } + ], + "default_payloads": { + "mayonnaise": { + "content_type": "application/json", + "prompt_key": "inputs", + "output_keys": { + "generated_text": "generated_text" + }, + "body": { + "inputs": "[INST] what is the recipe of mayonnaise? [/INST] ", + "parameters": { + "max_new_tokens": 256, + "top_p": 0.9, + "temperature": 0.6 + } + } + }, + "parisTrip": { + "content_type": "application/json", + "prompt_key": "inputs", + "output_keys": { + "generated_text": "generated_text" + }, + "body": { + "inputs": "[INST] I am going to Paris, what should I see? [/INST] Paris, the capital of France, is known for its stunning architecture, art museums, historical landmarks, and romantic atmosphere. Here are some of the top attractions to see in Paris:\n\n1. The Eiffel Tower: The iconic Eiffel Tower is one of the most recognizable landmarks in the world and offers breathtaking views of the city.\n2. The Louvre Museum: The Louvre is one of the world's largest and most famous museums, housing an impressive collection of art and artifacts, including the Mona Lisa.\n3. Notre-Dame Cathedral: This beautiful cathedral is one of the most famous landmarks in Paris and is known for its Gothic architecture and stunning stained glass windows.\n\nThese are just a few of the many attractions that Paris has to offer. With so much to see and do, it's no wonder that Paris is one of the most popular tourist destinations in the world.[INST] What is so great about #1? [/INST] ", + "parameters": { + "max_new_tokens": 256, + "top_p": 0.9, + "temperature": 0.6 + } + } + }, + "parisHaiku": { + "content_type": "application/json", + "prompt_key": "inputs", + "output_keys": { + "generated_text": "generated_text" + }, + "body": { + "inputs": "[INST] <>\nAlways answer with Haiku\n<>\n\nI am going to Paris, what should I see? [/INST] ", + "parameters": { + "max_new_tokens": 256, + "top_p": 0.9, + "temperature": 0.6 + } + } + }, + "emojisBeijing": { + "content_type": "application/json", + "prompt_key": "inputs", + "output_keys": { + "generated_text": "generated_text" + }, + "body": { + "inputs": "[INST] <>\nAlways answer with detailed instruction\n<>\n\nHow to go from Beijing to NY? [/INST] ", + "parameters": { + "max_new_tokens": 256, + "top_p": 0.9, + "temperature": 0.6 + } + } + } + } + } + }, + "inference_config_rankings": { + "overall": { + "description": "default", + "rankings": [ + "tgi", + "lmi", + "lmi-optimized", + "neuron" + ] + } + }, + "hosting_neuron_model_id": "meta-textgenerationneuron-llama-2-7b-f", + "hosting_neuron_model_version": "1.0.0" +} \ No newline at end of file diff --git a/src/sagemaker/jumpstart/factory/model.py b/src/sagemaker/jumpstart/factory/model.py index c5dab29c16..bb60db68c5 100644 --- a/src/sagemaker/jumpstart/factory/model.py +++ b/src/sagemaker/jumpstart/factory/model.py @@ -929,12 +929,6 @@ def get_init_kwargs( model_init_kwargs = _add_vulnerable_and_deprecated_status_to_kwargs(kwargs=model_init_kwargs) model_init_kwargs = _add_model_version_to_kwargs(kwargs=model_init_kwargs) - - # Add instance type before config selection so config compatibility can be checked - model_init_kwargs = _add_instance_type_to_kwargs( - kwargs=model_init_kwargs, disable_instance_type_logging=disable_instance_type_logging - ) - model_init_kwargs = _add_config_name_to_init_kwargs(kwargs=model_init_kwargs) model_init_kwargs = _add_sagemaker_session_with_custom_user_agent_to_kwargs( @@ -944,6 +938,10 @@ def get_init_kwargs( model_init_kwargs = _add_model_name_to_kwargs(kwargs=model_init_kwargs) + model_init_kwargs = _add_instance_type_to_kwargs( + kwargs=model_init_kwargs, disable_instance_type_logging=disable_instance_type_logging + ) + model_init_kwargs = _add_image_uri_to_kwargs(kwargs=model_init_kwargs) if hub_arn: diff --git a/src/sagemaker/jumpstart/types.py b/src/sagemaker/jumpstart/types.py index b0cea6eb02..379d1c0545 100644 --- a/src/sagemaker/jumpstart/types.py +++ b/src/sagemaker/jumpstart/types.py @@ -1724,7 +1724,12 @@ def get_top_config_from_ranking( for config_name in ranked_config_names: resolved_config = self.configs[config_name].resolved_config if instance_type: - supported_instance_types = getattr(resolved_config, instance_type_attribute, []) + # Handle both dict and object types for resolved_config + if isinstance(resolved_config, dict): + supported_instance_types = resolved_config.get(instance_type_attribute, []) + else: + supported_instance_types = getattr(resolved_config, instance_type_attribute, []) + if supported_instance_types and instance_type not in supported_instance_types: continue return self.configs[config_name] diff --git a/src/sagemaker/jumpstart/utils.py b/src/sagemaker/jumpstart/utils.py index b7c22bbda6..6396390fbd 100644 --- a/src/sagemaker/jumpstart/utils.py +++ b/src/sagemaker/jumpstart/utils.py @@ -1237,10 +1237,6 @@ def get_top_ranked_config_name( ) -> Optional[str]: """Returns the top ranked config name for the given model ID and region. - Args: - instance_type (Optional[str]): The instance type to filter configs by compatibility. - If provided, only configs that support this instance type will be considered. - Raises: ValueError: If the script scope is not supported by JumpStart. """ diff --git a/test_unified_model_card.py b/test_unified_model_card.py new file mode 100644 index 0000000000..475d46afdb --- /dev/null +++ b/test_unified_model_card.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python3 + +import json +import os +import sys +import boto3 +import time +from datetime import datetime +# from urllib.parse import urlparse +from unittest.mock import patch + +os.environ['HUGGING_FACE_HUB_TOKEN'] = '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' +os.environ['AWS_DEFAULT_REGION'] = 'us-west-2' + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src')) + +from sagemaker.jumpstart.model import JumpStartModel +from sagemaker.jumpstart.types import JumpStartModelSpecs +from sagemaker.jumpstart.enums import JumpStartModelType + + +def check_aws_account(): + """Check which AWS account and region we're using.""" + try: + sts_client = boto3.client('sts') + identity = sts_client.get_caller_identity() + + account_id = identity['Account'] + user_arn = identity['Arn'] + region = boto3.Session().region_name or 'us-west-2' + + print(f" AWS Account: {account_id}") + print(f" User/Role: {user_arn}") + print(f" Region: {region}") + print() + + return account_id, region + except Exception as e: + print(f" Error checking AWS account: {e}") + return None, None + + +def monitor_endpoint(endpoint_name, region='us-west-2'): + """Monitor endpoint deployment progress.""" + sagemaker_client = boto3.client('sagemaker', region_name=region) + + print(f" Monitoring endpoint: {endpoint_name}") + start_time = time.time() + + while True: + try: + response = sagemaker_client.describe_endpoint(EndpointName=endpoint_name) + status = response['EndpointStatus'] + elapsed = int(time.time() - start_time) + + print(f" [{elapsed//60}m {elapsed%60}s] {endpoint_name}: {status}") + + if status == 'InService': + print(f" {endpoint_name} is ready! (took {elapsed//60}m {elapsed%60}s)") + break + elif status == 'Failed': + print(f" {endpoint_name} deployment failed!") + print(f"Failure reason: {response.get('FailureReason', 'Unknown')}") + break + + except Exception as e: + print(f"Error checking {endpoint_name}: {e}") + + time.sleep(30) # Check every 30 seconds + +def load_custom_spec(): + """Load the custom spec file from src/sagemaker directory.""" + spec_path = os.path.join(os.path.dirname(__file__), 'specfileex') + with open(spec_path, 'r') as f: + return json.load(f) + + +# Check AWS account +account_id, region = check_aws_account() + +custom_spec = load_custom_spec() +mock_specs = JumpStartModelSpecs(custom_spec) + +with patch('sagemaker.jumpstart.cache.JumpStartModelsCache.get_specs') as mock_get_specs, \ + patch('sagemaker.jumpstart.utils.validate_model_id_and_get_type') as mock_validate_model: + + mock_get_specs.return_value = mock_specs + mock_validate_model.return_value = JumpStartModelType.OPEN_WEIGHTS + + model_id = "meta-textgeneration-llama-2-7b-f" + model_version = "4.19.0" + accept_eula = False + + # Create unique endpoint names with timestamp + timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") + neuron_endpoint_name = f"llama-neuron-{timestamp}" + gpu_endpoint_name = f"llama-gpu-{timestamp}" + + print(f" Neuron endpoint: {neuron_endpoint_name}") + print(f" GPU endpoint: {gpu_endpoint_name}") + print() + + + model_neuron = JumpStartModel( + model_id=model_id, + model_version=model_version, + instance_type="ml.inf2.24xlarge", + env={"HUGGING_FACE_HUB_TOKEN": "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"} + ) + + # Modify to use alpha us-west-2 bucket + original_neuron_uri = model_neuron.model_data['S3DataSource']['S3Uri'] + # Replace with alpha us-west-2 bucket (handle both east-1 and west-2 original buckets) + alpha_neuron_uri = original_neuron_uri.replace('jumpstart-private-cache-prod-us-east-1', 'jumpstart-private-cache-alpha-us-west-2') + alpha_neuron_uri = alpha_neuron_uri.replace('jumpstart-private-cache-prod-us-west-2', 'jumpstart-private-cache-alpha-us-west-2') + # Also handle regular cache buckets (without "private") + alpha_neuron_uri = alpha_neuron_uri.replace('jumpstart-cache-prod-us-east-1', 'jumpstart-cache-alpha-us-west-2') + alpha_neuron_uri = alpha_neuron_uri.replace('jumpstart-cache-prod-us-west-2', 'jumpstart-cache-alpha-us-west-2') + model_neuron.model_data['S3DataSource']['S3Uri'] = alpha_neuron_uri + print(f"Original neuron URI: {original_neuron_uri}") + print(f"Alpha neuron URI: {alpha_neuron_uri}") + print(model_neuron.model_data) + neuron_location = model_neuron.model_data['S3DataSource']['S3Uri'] + print(f"Neuron location: {neuron_location}") + + print("Deploying neuron model...") + neuron_predictor = model_neuron.deploy( + initial_instance_count=1, + instance_type="ml.inf2.24xlarge", + endpoint_name=neuron_endpoint_name, + accept_eula=True, + wait=False + ) + + # Monitor neuron deployment + monitor_endpoint(neuron_endpoint_name, 'us-west-2') + + + + model_gpu = JumpStartModel( + model_id=model_id, + model_version=model_version, + instance_type="ml.g5.12xlarge", + env={"HUGGING_FACE_HUB_TOKEN": "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"} + ) + + # Modify to use alpha us-west-2 bucket + original_gpu_uri = model_gpu.model_data['S3DataSource']['S3Uri'] + # Replace with alpha us-west-2 bucket (handle both east-1 and west-2 original buckets) + alpha_gpu_uri = original_gpu_uri.replace('jumpstart-private-cache-prod-us-east-1', 'jumpstart-private-cache-alpha-us-west-2') + alpha_gpu_uri = alpha_gpu_uri.replace('jumpstart-private-cache-prod-us-west-2', 'jumpstart-private-cache-alpha-us-west-2') + # Also handle regular cache buckets (without "private") + alpha_gpu_uri = alpha_gpu_uri.replace('jumpstart-cache-prod-us-east-1', 'jumpstart-cache-alpha-us-west-2') + alpha_gpu_uri = alpha_gpu_uri.replace('jumpstart-cache-prod-us-west-2', 'jumpstart-cache-alpha-us-west-2') + model_gpu.model_data['S3DataSource']['S3Uri'] = alpha_gpu_uri + print(f"Original GPU URI: {original_gpu_uri}") + print(f"Alpha GPU URI: {alpha_gpu_uri}") + print(model_gpu.model_data) + gpu_location = model_gpu.model_data['S3DataSource']['S3Uri'] + print(f"GPU location: {gpu_location}") + + print("Deploying GPU model...") + gpu_predictor = model_gpu.deploy( + initial_instance_count=1, + instance_type="ml.g5.12xlarge", + endpoint_name=gpu_endpoint_name, + accept_eula=True, + wait=False + ) + + # Monitor GPU deployment + monitor_endpoint(gpu_endpoint_name, 'us-west-2') + + test_payload = { + "inputs": "The meaning of life is", + "parameters": { + "max_new_tokens": 50, + "temperature": 0.7 + } + } + + print("Testing neuron endpoint...") + neuron_response = neuron_predictor.predict(test_payload) + print(f"Neuron response: {neuron_response}") + + print("Testing GPU endpoint...") + gpu_response = gpu_predictor.predict(test_payload) + print(f"GPU response: {gpu_response}") + + + #print("Cleaning up endpoints...") + #neuron_predictor.delete_endpoint() + #gpu_predictor.delete_endpoint() From 2976c5cd585e0b952acea068ae7b8f87751ed3a6 Mon Sep 17 00:00:00 2001 From: Tanvika Boyineni Date: Wed, 6 Aug 2025 15:21:51 -0700 Subject: [PATCH 3/4] tests: testing changes --- .../jumpstart/test_auto_config_resolution.py | 236 ++++++++++++++++++ 1 file changed, 236 insertions(+) create mode 100644 tests/unit/sagemaker/jumpstart/test_auto_config_resolution.py diff --git a/tests/unit/sagemaker/jumpstart/test_auto_config_resolution.py b/tests/unit/sagemaker/jumpstart/test_auto_config_resolution.py new file mode 100644 index 0000000000..05d5dbc8df --- /dev/null +++ b/tests/unit/sagemaker/jumpstart/test_auto_config_resolution.py @@ -0,0 +1,236 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import +import pytest +from unittest.mock import patch, Mock +from sagemaker.jumpstart import utils +from sagemaker.jumpstart.enums import JumpStartScriptScope, JumpStartConfigRankingName +from sagemaker.jumpstart.factory.estimator import _add_config_name_to_kwargs +from sagemaker.jumpstart.factory.model import ( + _add_config_name_to_init_kwargs, + _add_config_name_to_deploy_kwargs, +) +from sagemaker.jumpstart.types import JumpStartEstimatorInitKwargs, JumpStartModelInitKwargs + + +class TestAutoConfigResolution: + """Test auto resolution of config names based on instance type.""" + + def create_mock_configs(self, scope): + """Create mock configs for testing with different supported instance types.""" + # Mock the config object structure + config1 = Mock() + config1.config_name = "config1" + config1.resolved_config = { + "supported_inference_instance_types": ["ml.g5.xlarge", "ml.g5.2xlarge"] + if scope == JumpStartScriptScope.INFERENCE + else [], + "supported_training_instance_types": ["ml.g5.xlarge", "ml.g5.2xlarge"] + if scope == JumpStartScriptScope.TRAINING + else [], + } + + config2 = Mock() + config2.config_name = "config2" + config2.resolved_config = { + "supported_inference_instance_types": ["ml.p4d.24xlarge", "ml.p5.48xlarge"] + if scope == JumpStartScriptScope.INFERENCE + else [], + "supported_training_instance_types": ["ml.p4d.24xlarge", "ml.p5.48xlarge"] + if scope == JumpStartScriptScope.TRAINING + else [], + } + + # Config with no instance type restrictions + config3 = Mock() + config3.config_name = "config3" + config3.resolved_config = { + "supported_inference_instance_types": [] + if scope == JumpStartScriptScope.INFERENCE + else [], + "supported_training_instance_types": [] + if scope == JumpStartScriptScope.TRAINING + else [], + } + + # Mock config rankings + ranking = Mock() + ranking.rankings = ["config1", "config2", "config3"] + + # Mock the metadata configs container + configs = Mock() + configs.scope = scope + configs.configs = { + "config1": config1, + "config2": config2, + "config3": config3, + } + configs.config_rankings = {JumpStartConfigRankingName.DEFAULT: ranking} + + # Import the actual get_top_config_from_ranking method so we can test it + from sagemaker.jumpstart.types import JumpStartMetadataConfigs + configs.get_top_config_from_ranking = JumpStartMetadataConfigs.get_top_config_from_ranking.__get__(configs) + + return configs + + def test_get_top_config_from_ranking_with_matching_instance_type(self): + """Test that get_top_config_from_ranking returns config that supports the instance type.""" + configs = self.create_mock_configs(JumpStartScriptScope.INFERENCE) + + # Test with instance type that matches config1 + result = configs.get_top_config_from_ranking(instance_type="ml.g5.xlarge") + assert result is not None + assert result.config_name == "config1" + + # Test with instance type that matches config2 + result = configs.get_top_config_from_ranking(instance_type="ml.p4d.24xlarge") + assert result is not None + assert result.config_name == "config2" + + def test_get_top_config_from_ranking_with_no_matching_instance_type(self): + """Test behavior when no config supports the requested instance type.""" + configs = self.create_mock_configs(JumpStartScriptScope.INFERENCE) + + # Test with instance type that doesn't match any config + result = configs.get_top_config_from_ranking(instance_type="ml.m5.xlarge") + assert result is not None + assert result.config_name == "config3" # Should fall back to config with no restrictions + + def test_get_top_config_from_ranking_without_instance_type(self): + """Test that get_top_config_from_ranking returns first ranked config when no instance type specified.""" + configs = self.create_mock_configs(JumpStartScriptScope.INFERENCE) + + result = configs.get_top_config_from_ranking() + assert result is not None + assert result.config_name == "config1" # First in ranking + + def test_get_top_config_from_ranking_training_scope(self): + """Test get_top_config_from_ranking with training scope.""" + configs = self.create_mock_configs(JumpStartScriptScope.TRAINING) + + # Test with training instance type + result = configs.get_top_config_from_ranking(instance_type="ml.g5.xlarge") + assert result is not None + assert result.config_name == "config1" + + def test_get_top_config_from_ranking_with_object_resolved_config(self): + """Test get_top_config_from_ranking when resolved_config is an object (not dict).""" + # Create a mock object with getattr support + mock_resolved_config = Mock() + mock_resolved_config.supported_inference_instance_types = ["ml.g5.xlarge"] + + config = Mock() + config.config_name = "test_config" + config.resolved_config = mock_resolved_config + + ranking = Mock() + ranking.rankings = ["test_config"] + + configs = Mock() + configs.scope = JumpStartScriptScope.INFERENCE + configs.configs = {"test_config": config} + configs.config_rankings = {JumpStartConfigRankingName.DEFAULT: ranking} + + # Import the actual method + from sagemaker.jumpstart.types import JumpStartMetadataConfigs + configs.get_top_config_from_ranking = JumpStartMetadataConfigs.get_top_config_from_ranking.__get__(configs) + + result = configs.get_top_config_from_ranking(instance_type="ml.g5.xlarge") + assert result is not None + assert result.config_name == "test_config" + + def test_get_top_config_from_ranking_empty_supported_instance_types(self): + """Test behavior when config has empty supported_instance_types list.""" + config = Mock() + config.config_name = "empty_config" + config.resolved_config = { + "supported_inference_instance_types": [], + } + + ranking = Mock() + ranking.rankings = ["empty_config"] + + configs = Mock() + configs.scope = JumpStartScriptScope.INFERENCE + configs.configs = {"empty_config": config} + configs.config_rankings = {JumpStartConfigRankingName.DEFAULT: ranking} + + # Import the actual method + from sagemaker.jumpstart.types import JumpStartMetadataConfigs + configs.get_top_config_from_ranking = JumpStartMetadataConfigs.get_top_config_from_ranking.__get__(configs) + + # Should return config even with empty list (no restrictions) + result = configs.get_top_config_from_ranking(instance_type="ml.g5.xlarge") + assert result is not None + assert result.config_name == "empty_config" + + def test_instance_type_parameter_signature(self): + """Test that get_top_ranked_config_name function accepts instance_type parameter.""" + # Import and inspect the function signature + import inspect + from typing import Optional + sig = inspect.signature(utils.get_top_ranked_config_name) + + # Verify that instance_type parameter exists in the signature + assert "instance_type" in sig.parameters + + # Verify it's an optional parameter with None default + instance_type_param = sig.parameters["instance_type"] + assert instance_type_param.default is None + assert instance_type_param.annotation == Optional[str] + + def test_get_top_config_from_ranking_preserves_existing_config_name(self): + """Test that existing config_name is preserved when already specified.""" + mock_get_config = Mock(return_value="auto_selected") + + with patch("sagemaker.jumpstart.utils.get_top_ranked_config_name", mock_get_config): + kwargs = JumpStartEstimatorInitKwargs( + model_id="test-model", + instance_type="ml.g5.xlarge", + config_name="user_specified_config", + ) + + result = _add_config_name_to_kwargs(kwargs) + + # Should not call get_top_ranked_config_name when config_name already exists + mock_get_config.assert_not_called() + assert result.config_name == "user_specified_config" + + def test_config_ranking_respects_priority_with_instance_type_filter(self): + """Test that config ranking priority is respected when filtering by instance type.""" + # Create configs where config2 is ranked higher but config1 matches instance type + config1 = Mock() + config1.config_name = "config1" + config1.resolved_config = {"supported_inference_instance_types": ["ml.g5.xlarge"]} + + config2 = Mock() + config2.config_name = "config2" + config2.resolved_config = {"supported_inference_instance_types": ["ml.p4d.24xlarge"]} + + # Rank config2 higher than config1 + ranking = Mock() + ranking.rankings = ["config2", "config1"] + + configs = Mock() + configs.scope = JumpStartScriptScope.INFERENCE + configs.configs = {"config1": config1, "config2": config2} + configs.config_rankings = {JumpStartConfigRankingName.DEFAULT: ranking} + + # Import the actual method + from sagemaker.jumpstart.types import JumpStartMetadataConfigs + configs.get_top_config_from_ranking = JumpStartMetadataConfigs.get_top_config_from_ranking.__get__(configs) + + # Even though config2 is ranked higher, config1 should be returned because it matches instance type + result = configs.get_top_config_from_ranking(instance_type="ml.g5.xlarge") + assert result is not None + assert result.config_name == "config1" \ No newline at end of file From 934123d1b638f68bce210b11fade971a3dbe60b6 Mon Sep 17 00:00:00 2001 From: Tanvika Boyineni Date: Thu, 7 Aug 2025 11:11:37 -0700 Subject: [PATCH 4/4] chore: clean up files --- specfileex | 2960 ------------------------------------ test_unified_model_card.py | 193 --- 2 files changed, 3153 deletions(-) delete mode 100644 specfileex delete mode 100644 test_unified_model_card.py diff --git a/specfileex b/specfileex deleted file mode 100644 index e2d15647d0..0000000000 --- a/specfileex +++ /dev/null @@ -1,2960 +0,0 @@ -{ - "model_id": "meta-textgeneration-llama-2-7b-f", - "provider": "meta", - "url": "https://ai.meta.com/resources/models-and-libraries/llama-downloads/", - "version": "4.19.0", - "min_sdk_version": "2.225.0", - "training_supported": true, - "incremental_training_supported": true, - "hosting_ecr_specs": { - "framework": "huggingface-llm", - "framework_version": "2.0.0", - "py_version": "py310" - }, - "hosting_artifact_key": "meta-textgeneration/meta-textgeneration-llama-2-7b-f/artifacts/inference/v1.1.0/", - "hosting_script_key": "source-directory-tarballs/meta/inference/textgeneration/v1.2.3/sourcedir.tar.gz", - "hosting_prepacked_artifact_key": "meta-textgeneration/meta-textgeneration-llama-2-7b-f/artifacts/inference-prepack/v1.1.0/", - "hosting_prepacked_artifact_version": "1.1.0", - "hosting_use_script_uri": false, - "hosting_eula_key": "fmhMetadata/eula/llamaEula.txt", - "inference_vulnerable": false, - "inference_dependencies": [], - "inference_vulnerabilities": [], - "training_vulnerable": false, - "training_dependencies": [ - "accelerate==0.33.0", - "bitsandbytes==0.39.1", - "black==23.7.0", - "brotli==1.0.9", - "datasets==2.14.1", - "docstring-parser==0.16", - "fire==0.5.0", - "huggingface-hub==0.24.2", - "inflate64==0.3.1", - "loralib==0.1.1", - "multivolumefile==0.2.3", - "mypy-extensions==1.0.0", - "nvidia-cublas-cu12==12.1.3.1", - "nvidia-cuda-cupti-cu12==12.1.105", - "nvidia-cuda-nvrtc-cu12==12.1.105", - "nvidia-cuda-runtime-cu12==12.1.105", - "nvidia-cudnn-cu12==8.9.2.26", - "nvidia-cufft-cu12==11.0.2.54", - "nvidia-curand-cu12==10.3.2.106", - "nvidia-cusolver-cu12==11.4.5.107", - "nvidia-cusolver-cu12==11.4.5.107", - "nvidia-cusparse-cu12==12.1.0.106", - "nvidia-nccl-cu12==2.19.3", - "nvidia-nvjitlink-cu12==12.3.101", - "nvidia-nvtx-cu12==12.1.105", - "pathspec==0.11.1", - "peft==0.4.0", - "py7zr==0.20.5", - "pybcj==1.0.1", - "pycryptodomex==3.18.0", - "pyppmd==1.0.0", - "pyzstd==0.15.9", - "safetensors==0.4.2", - "sagemaker_jumpstart_huggingface_script_utilities==1.2.7", - "sagemaker_jumpstart_script_utilities==1.1.9", - "scipy==1.11.1", - "shtab==1.7.1", - "termcolor==2.3.0", - "texttable==1.6.7", - "tokenize-rt==5.1.0", - "tokenizers==0.19.1", - "torch==2.2.0", - "transformers==4.43.1", - "triton==2.2.0", - "trl==0.8.1", - "typing-extensions==4.8.0", - "tyro==0.7.3" - ], - "training_vulnerabilities": [], - "deprecated": false, - "hyperparameters": [ - { - "name": "int8_quantization", - "type": "text", - "default": "False", - "options": [ - "True", - "False" - ], - "scope": "algorithm" - }, - { - "name": "enable_fsdp", - "type": "text", - "default": "True", - "options": [ - "True", - "False" - ], - "scope": "algorithm" - }, - { - "name": "epoch", - "type": "int", - "default": 1, - "min": 1, - "max": 1000, - "scope": "algorithm" - }, - { - "name": "learning_rate", - "type": "float", - "default": 0.0001, - "min": 1e-08, - "max": 1, - "scope": "algorithm" - }, - { - "name": "lora_r", - "type": "int", - "default": 8, - "min": 1, - "scope": "algorithm" - }, - { - "name": "lora_alpha", - "type": "int", - "default": 32, - "min": 1, - "scope": "algorithm" - }, - { - "name": "target_modules", - "type": "text", - "default": "q_proj,v_proj", - "scope": "algorithm" - }, - { - "name": "lora_dropout", - "type": "float", - "default": 0.05, - "min": 0, - "max": 1, - "scope": "algorithm" - }, - { - "name": "instruction_tuned", - "type": "text", - "default": "False", - "options": [ - "True", - "False" - ], - "scope": "algorithm" - }, - { - "name": "chat_dataset", - "type": "text", - "default": "True", - "options": [ - "True", - "False" - ], - "scope": "algorithm" - }, - { - "name": "add_input_output_demarcation_key", - "type": "text", - "default": "True", - "options": [ - "True", - "False" - ], - "scope": "algorithm" - }, - { - "name": "per_device_train_batch_size", - "type": "int", - "default": 1, - "min": 1, - "max": 1000, - "scope": "algorithm" - }, - { - "name": "per_device_eval_batch_size", - "type": "int", - "default": 1, - "min": 1, - "max": 1000, - "scope": "algorithm" - }, - { - "name": "max_train_samples", - "type": "int", - "default": -1, - "min": -1, - "scope": "algorithm" - }, - { - "name": "max_val_samples", - "type": "int", - "default": -1, - "min": -1, - "scope": "algorithm" - }, - { - "name": "seed", - "type": "int", - "default": 10, - "min": 1, - "max": 1000, - "scope": "algorithm" - }, - { - "name": "max_input_length", - "type": "int", - "default": -1, - "min": -1, - "scope": "algorithm" - }, - { - "name": "validation_split_ratio", - "type": "float", - "default": 0.2, - "min": 0, - "max": 1, - "scope": "algorithm" - }, - { - "name": "train_data_split_seed", - "type": "int", - "default": 0, - "min": 0, - "scope": "algorithm" - }, - { - "name": "preprocessing_num_workers", - "type": "text", - "default": "None", - "scope": "algorithm" - }, - { - "name": "sagemaker_submit_directory", - "type": "text", - "default": "/opt/ml/input/data/code/sourcedir.tar.gz", - "scope": "container" - }, - { - "name": "sagemaker_program", - "type": "text", - "default": "transfer_learning.py", - "scope": "container" - }, - { - "name": "sagemaker_container_log_level", - "type": "text", - "default": "20", - "scope": "container" - } - ], - "training_script_key": "source-directory-tarballs/training/meta-textgeneration/v1.2.0/sourcedir.tar.gz", - "training_prepacked_script_key": "source-directory-tarballs/training/meta-textgeneration/prepack/inference-meta-textgeneration/v1.2.0/sourcedir.tar.gz", - "training_prepacked_script_version": "1.2.0", - "training_ecr_specs": { - "framework": "huggingface", - "framework_version": "2.0.0", - "py_version": "py310", - "huggingface_transformers_version": "4.28.1" - }, - "training_artifact_key": "meta-training/v1.1.0/train-meta-textgeneration-llama-2-7b-f.tar.gz", - "inference_environment_variables": [ - { - "name": "SAGEMAKER_PROGRAM", - "type": "text", - "default": "inference.py", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "SAGEMAKER_SUBMIT_DIRECTORY", - "type": "text", - "default": "/opt/ml/model/code", - "scope": "container", - "required_for_model_class": false - }, - { - "name": "SAGEMAKER_CONTAINER_LOG_LEVEL", - "type": "text", - "default": "20", - "scope": "container", - "required_for_model_class": false - }, - { - "name": "SAGEMAKER_MODEL_SERVER_TIMEOUT", - "type": "text", - "default": "3600", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "ENDPOINT_SERVER_TIMEOUT", - "type": "int", - "default": 3600, - "scope": "container", - "required_for_model_class": true - }, - { - "name": "MODEL_CACHE_ROOT", - "type": "text", - "default": "/opt/ml/model", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "SAGEMAKER_ENV", - "type": "text", - "default": "1", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "HF_MODEL_ID", - "type": "text", - "default": "/opt/ml/model", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "OPTION_GPU_MEMORY_UTILIZATION", - "type": "text", - "default": "0.85", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "SM_NUM_GPUS", - "type": "text", - "default": "1", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "MAX_INPUT_LENGTH", - "type": "text", - "default": "4095", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "MAX_TOTAL_TOKENS", - "type": "text", - "default": "4096", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "MAX_BATCH_PREFILL_TOKENS", - "type": "text", - "default": "8192", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "MAX_CONCURRENT_REQUESTS", - "type": "text", - "default": "512", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "SAGEMAKER_MODEL_SERVER_WORKERS", - "type": "int", - "default": 1, - "scope": "container", - "required_for_model_class": true - } - ], - "metrics": [ - { - "Name": "huggingface-textgeneration:eval-loss", - "Regex": "eval_epoch_loss=tensor\\(([0-9\\.]+)" - }, - { - "Name": "huggingface-textgeneration:eval-ppl", - "Regex": "eval_ppl=tensor\\(([0-9\\.]+)" - }, - { - "Name": "huggingface-textgeneration:train-loss", - "Regex": "train_epoch_loss=([0-9\\.]+)" - } - ], - "default_inference_instance_type": "ml.g5.12xlarge", - "supported_inference_instance_types": [ - "ml.g5.12xlarge", - "ml.g5.24xlarge", - "ml.g5.2xlarge", - "ml.g5.48xlarge", - "ml.g5.4xlarge", - "ml.g5.8xlarge", - "ml.g6.12xlarge", - "ml.p4d.24xlarge" - ], - "default_training_instance_type": "ml.g5.12xlarge", - "supported_training_instance_types": [ - "ml.g5.12xlarge", - "ml.g5.24xlarge", - "ml.g5.48xlarge", - "ml.p3dn.24xlarge", - "ml.g4dn.12xlarge" - ], - "model_kwargs": {}, - "estimator_kwargs": { - "encrypt_inter_container_traffic": true, - "disable_output_compression": true, - "max_run": 360000 - }, - "fit_kwargs": {}, - "inference_volume_size": 256, - "training_volume_size": 256, - "inference_enable_network_isolation": true, - "training_enable_network_isolation": true, - "default_training_dataset_key": "training-datasets/oasst_top/train/", - "validation_supported": true, - "fine_tuning_supported": true, - "resource_name_base": "meta-textgeneration-llama-2-7b-f", - "gated_bucket": true, - "training_instance_type_variants": { - "regional_aliases": { - "af-south-1": { - "gpu_ecr_uri_1": "626614931356.dkr.ecr.af-south-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" - }, - "ap-east-1": { - "gpu_ecr_uri_1": "871362719292.dkr.ecr.ap-east-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" - }, - "ap-northeast-1": { - "gpu_ecr_uri_1": "763104351884.dkr.ecr.ap-northeast-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" - }, - "ap-northeast-2": { - "gpu_ecr_uri_1": "763104351884.dkr.ecr.ap-northeast-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" - }, - "ap-northeast-3": { - "gpu_ecr_uri_1": "364406365360.dkr.ecr.ap-northeast-3.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" - }, - "ap-south-1": { - "gpu_ecr_uri_1": "763104351884.dkr.ecr.ap-south-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" - }, - "ap-southeast-1": { - "gpu_ecr_uri_1": "763104351884.dkr.ecr.ap-southeast-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" - }, - "ap-southeast-2": { - "gpu_ecr_uri_1": "763104351884.dkr.ecr.ap-southeast-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" - }, - "ap-southeast-3": { - "gpu_ecr_uri_1": "907027046896.dkr.ecr.ap-southeast-3.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" - }, - "ca-central-1": { - "gpu_ecr_uri_1": "763104351884.dkr.ecr.ca-central-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" - }, - "ca-west-1": { - "gpu_ecr_uri_1": "204538143572.dkr.ecr.ca-west-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" - }, - "cn-north-1": { - "gpu_ecr_uri_1": "727897471807.dkr.ecr.cn-north-1.amazonaws.com.cn/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" - }, - "cn-northwest-1": { - "gpu_ecr_uri_1": "727897471807.dkr.ecr.cn-northwest-1.amazonaws.com.cn/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" - }, - "eu-central-1": { - "gpu_ecr_uri_1": "763104351884.dkr.ecr.eu-central-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" - }, - "eu-north-1": { - "gpu_ecr_uri_1": "763104351884.dkr.ecr.eu-north-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" - }, - "eu-south-1": { - "gpu_ecr_uri_1": "692866216735.dkr.ecr.eu-south-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" - }, - "eu-west-1": { - "gpu_ecr_uri_1": "763104351884.dkr.ecr.eu-west-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" - }, - "eu-west-2": { - "gpu_ecr_uri_1": "763104351884.dkr.ecr.eu-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" - }, - "eu-west-3": { - "gpu_ecr_uri_1": "763104351884.dkr.ecr.eu-west-3.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" - }, - "il-central-1": { - "gpu_ecr_uri_1": "780543022126.dkr.ecr.il-central-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" - }, - "me-central-1": { - "gpu_ecr_uri_1": "914824155844.dkr.ecr.me-central-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" - }, - "me-south-1": { - "gpu_ecr_uri_1": "217643126080.dkr.ecr.me-south-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" - }, - "sa-east-1": { - "gpu_ecr_uri_1": "763104351884.dkr.ecr.sa-east-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" - }, - "us-east-1": { - "gpu_ecr_uri_1": "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" - }, - "us-east-2": { - "gpu_ecr_uri_1": "763104351884.dkr.ecr.us-east-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" - }, - "us-gov-east-1": { - "gpu_ecr_uri_1": "446045086412.dkr.ecr.us-gov-east-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" - }, - "us-gov-west-1": { - "gpu_ecr_uri_1": "442386744353.dkr.ecr.us-gov-west-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" - }, - "us-west-1": { - "gpu_ecr_uri_1": "763104351884.dkr.ecr.us-west-1.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" - }, - "us-west-2": { - "gpu_ecr_uri_1": "763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04" - } - }, - "variants": { - "g4dn": { - "regional_properties": { - "image_uri": "$gpu_ecr_uri_1" - }, - "properties": { - "gated_model_key_env_var_value": "meta-training/g4dn/v1.0.0/train-meta-textgeneration-llama-2-7b-f.tar.gz" - } - }, - "g5": { - "regional_properties": { - "image_uri": "$gpu_ecr_uri_1" - }, - "properties": { - "gated_model_key_env_var_value": "meta-training/g5/v1.0.0/train-meta-textgeneration-llama-2-7b-f.tar.gz" - } - }, - "g6": { - "regional_properties": { - "image_uri": "$gpu_ecr_uri_1" - } - }, - "g6e": { - "regional_properties": { - "image_uri": "$gpu_ecr_uri_1" - } - }, - "local_gpu": { - "regional_properties": { - "image_uri": "$gpu_ecr_uri_1" - } - }, - "p2": { - "regional_properties": { - "image_uri": "$gpu_ecr_uri_1" - } - }, - "p3": { - "regional_properties": { - "image_uri": "$gpu_ecr_uri_1" - } - }, - "p3dn": { - "regional_properties": { - "image_uri": "$gpu_ecr_uri_1" - }, - "properties": { - "gated_model_key_env_var_value": "meta-training/p3dn/v1.0.0/train-meta-textgeneration-llama-2-7b-f.tar.gz" - } - }, - "p4d": { - "regional_properties": { - "image_uri": "$gpu_ecr_uri_1" - } - }, - "p4de": { - "regional_properties": { - "image_uri": "$gpu_ecr_uri_1" - } - }, - "p5": { - "regional_properties": { - "image_uri": "$gpu_ecr_uri_1" - } - }, - "p5e": { - "regional_properties": { - "image_uri": "$gpu_ecr_uri_1" - } - }, - "p5en": { - "regional_properties": { - "image_uri": "$gpu_ecr_uri_1" - } - }, - "p6": { - "regional_properties": { - "image_uri": "$gpu_ecr_uri_1" - } - }, - "p6e": { - "regional_properties": { - "image_uri": "$gpu_ecr_uri_1" - } - } - } - }, - "hosting_artifact_s3_data_type": "S3Prefix", - "hosting_artifact_compression_type": "None", - "dynamic_container_deployment_supported": true, - "inference_configs": { - "tgi": { - "component_names": [ - "tgi" - ] - }, - "lmi": { - "component_names": [ - "lmi" - ], - "benchmark_metrics": { - "ml.g6.12xlarge": [ - { - "name": "latency", - "unit": "sec", - "value": "0.19", - "concurrency": "16" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "19.7", - "concurrency": "16" - }, - { - "name": "latency", - "unit": "sec", - "value": "0.22", - "concurrency": "32" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "11.6", - "concurrency": "32" - } - ], - "ml.p4d.24xlarge": [ - { - "name": "latency", - "unit": "sec", - "value": "2.58", - "concurrency": "256" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "3448.3", - "concurrency": "256" - } - ] - } - }, - "lmi-optimized": { - "component_names": [ - "lmi-optimized" - ], - "benchmark_metrics": { - "ml.g5.12xlarge": [ - { - "name": "latency", - "unit": "sec", - "value": "0.23", - "concurrency": "1" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "156.2", - "concurrency": "1" - }, - { - "name": "latency", - "unit": "sec", - "value": "0.25", - "concurrency": "2" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "93.1", - "concurrency": "2" - }, - { - "name": "latency", - "unit": "sec", - "value": "0.27", - "concurrency": "4" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "58.2", - "concurrency": "4" - }, - { - "name": "latency", - "unit": "sec", - "value": "0.29", - "concurrency": "8" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "31.0", - "concurrency": "8" - }, - { - "name": "latency", - "unit": "sec", - "value": "0.42", - "concurrency": "16" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "15.2", - "concurrency": "16" - }, - { - "name": "latency", - "unit": "sec", - "value": "0.58", - "concurrency": "32" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "8.0", - "concurrency": "32" - }, - { - "name": "latency", - "unit": "sec", - "value": "2.42", - "concurrency": "128" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "4.6", - "concurrency": "128" - } - ], - "ml.g5.2xlarge": [ - { - "name": "latency", - "unit": "sec", - "value": "0.19", - "concurrency": "1" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "66.9", - "concurrency": "1" - }, - { - "name": "latency", - "unit": "sec", - "value": "0.19", - "concurrency": "2" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "55.5", - "concurrency": "2" - }, - { - "name": "latency", - "unit": "sec", - "value": "0.22", - "concurrency": "4" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "41.8", - "concurrency": "4" - }, - { - "name": "latency", - "unit": "sec", - "value": "0.44", - "concurrency": "8" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "31.3", - "concurrency": "8" - }, - { - "name": "latency", - "unit": "sec", - "value": "2.87", - "concurrency": "16" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "71.1", - "concurrency": "16" - } - ], - "ml.g6.12xlarge": [ - { - "name": "latency", - "unit": "sec", - "value": "0.16", - "concurrency": "1" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "107.1", - "concurrency": "1" - }, - { - "name": "latency", - "unit": "sec", - "value": "0.17", - "concurrency": "2" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "79.5", - "concurrency": "2" - }, - { - "name": "latency", - "unit": "sec", - "value": "0.19", - "concurrency": "4" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "55.1", - "concurrency": "4" - }, - { - "name": "latency", - "unit": "sec", - "value": "0.21", - "concurrency": "8" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "34.4", - "concurrency": "8" - }, - { - "name": "latency", - "unit": "sec", - "value": "3.75", - "concurrency": "64" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "8.3", - "concurrency": "64" - } - ], - "ml.g6.2xlarge": [ - { - "name": "latency", - "unit": "sec", - "value": "0.23", - "concurrency": "1" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "38.2", - "concurrency": "1" - }, - { - "name": "latency", - "unit": "sec", - "value": "0.30", - "concurrency": "2" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "32.9", - "concurrency": "2" - }, - { - "name": "latency", - "unit": "sec", - "value": "0.30", - "concurrency": "4" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "24.5", - "concurrency": "4" - }, - { - "name": "latency", - "unit": "sec", - "value": "0.60", - "concurrency": "8" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "21.0", - "concurrency": "8" - }, - { - "name": "latency", - "unit": "sec", - "value": "4.19", - "concurrency": "16" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "50.0", - "concurrency": "16" - } - ], - "ml.p4d.24xlarge": [ - { - "name": "latency", - "unit": "sec", - "value": "0.06", - "concurrency": "1" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "150.2", - "concurrency": "1" - }, - { - "name": "latency", - "unit": "sec", - "value": "0.06", - "concurrency": "2" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "149.0", - "concurrency": "2" - }, - { - "name": "latency", - "unit": "sec", - "value": "0.06", - "concurrency": "4" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "149.0", - "concurrency": "4" - }, - { - "name": "latency", - "unit": "sec", - "value": "0.06", - "concurrency": "8" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "141.0", - "concurrency": "8" - }, - { - "name": "latency", - "unit": "sec", - "value": "0.06", - "concurrency": "16" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "128.9", - "concurrency": "16" - }, - { - "name": "latency", - "unit": "sec", - "value": "0.06", - "concurrency": "32" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "105.2", - "concurrency": "32" - }, - { - "name": "latency", - "unit": "sec", - "value": "0.07", - "concurrency": "64" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "73.9", - "concurrency": "64" - }, - { - "name": "latency", - "unit": "sec", - "value": "0.37", - "concurrency": "128" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "68.4", - "concurrency": "128" - }, - { - "name": "latency", - "unit": "sec", - "value": "4.58", - "concurrency": "512" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "11111.1", - "concurrency": "512" - } - ], - "ml.p5.48xlarge": [ - { - "name": "latency", - "unit": "sec", - "value": "0.04", - "concurrency": "1" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "253.2", - "concurrency": "1" - }, - { - "name": "latency", - "unit": "sec", - "value": "0.03", - "concurrency": "2" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "257.1", - "concurrency": "2" - }, - { - "name": "latency", - "unit": "sec", - "value": "0.03", - "concurrency": "4" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "252.5", - "concurrency": "4" - }, - { - "name": "latency", - "unit": "sec", - "value": "0.03", - "concurrency": "8" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "236.4", - "concurrency": "8" - }, - { - "name": "latency", - "unit": "sec", - "value": "0.04", - "concurrency": "16" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "213.2", - "concurrency": "16" - }, - { - "name": "latency", - "unit": "sec", - "value": "0.04", - "concurrency": "32" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "178.6", - "concurrency": "32" - }, - { - "name": "latency", - "unit": "sec", - "value": "0.04", - "concurrency": "64" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "129.0", - "concurrency": "64" - }, - { - "name": "latency", - "unit": "sec", - "value": "0.04", - "concurrency": "128" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "81.2", - "concurrency": "128" - }, - { - "name": "latency", - "unit": "sec", - "value": "0.33", - "concurrency": "256" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "74.5", - "concurrency": "256" - }, - { - "name": "latency", - "unit": "sec", - "value": "1.77", - "concurrency": "512" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "724.6", - "concurrency": "512" - }, - { - "name": "latency", - "unit": "sec", - "value": "2.96", - "concurrency": "768" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "6666.7", - "concurrency": "768" - }, - { - "name": "latency", - "unit": "sec", - "value": "2.22", - "concurrency": "1024" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "5882.4", - "concurrency": "1024" - }, - { - "name": "latency", - "unit": "sec", - "value": "3.88", - "concurrency": "1280" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "11111.1", - "concurrency": "1280" - }, - { - "name": "latency", - "unit": "sec", - "value": "3.99", - "concurrency": "1536" - }, - { - "name": "throughput", - "unit": "tokens/sec", - "value": "11111.1", - "concurrency": "1536" - } - ] - }, - "acceleration_configs": [ - { - "type": "Compilation", - "enabled": false - }, - { - "type": "Speculative-Decoding", - "enabled": true - }, - { - "type": "Quantization", - "enabled": false - } - ] - }, - "neuron": { - "component_names": [ - "neuron" - ] - } - }, - "inference_config_components": { - "tgi": { - "hosting_ecr_specs": { - "framework": "huggingface-llm", - "framework_version": "2.0.0", - "py_version": "py310" - }, - "hosting_script_key": "source-directory-tarballs/meta/inference/textgeneration/v1.2.3/sourcedir.tar.gz", - "hosting_use_script_uri": false, - "inference_dependencies": [], - "inference_vulnerable": false, - "inference_vulnerabilities": [], - "hosting_artifact_key": "meta-textgeneration/meta-textgeneration-llama-2-7b-f/artifacts/inference/v1.1.0/", - "hosting_prepacked_artifact_version": "1.1.0", - "hosting_prepacked_artifact_key": "meta-textgeneration/meta-textgeneration-llama-2-7b-f/artifacts/inference-prepack/v1.1.0/", - "hosting_artifact_s3_data_type": "S3Prefix", - "hosting_artifact_compression_type": "None", - "hosting_neuron_model_id": "meta-textgenerationneuron-llama-2-7b-f", - "hosting_neuron_model_version": "1.0.0", - "model_kwargs": {}, - "deploy_kwargs": { - "model_data_download_timeout": 1200, - "container_startup_health_check_timeout": 1200 - }, - "predictor_specs": { - "supported_content_types": [ - "application/json" - ], - "supported_accept_types": [ - "application/json" - ], - "default_content_type": "application/json", - "default_accept_type": "application/json" - }, - "default_inference_instance_type": "ml.g5.12xlarge", - "supported_inference_instance_types": [ - "ml.g5.12xlarge", - "ml.g5.24xlarge", - "ml.g5.2xlarge", - "ml.g5.48xlarge", - "ml.g5.4xlarge", - "ml.g5.8xlarge", - "ml.g6.12xlarge", - "ml.p4d.24xlarge" - ], - "hosting_instance_type_variants": { - "regional_aliases": { - "af-south-1": { - "gpu_ecr_uri_1": "626614931356.dkr.ecr.af-south-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "ap-east-1": { - "gpu_ecr_uri_1": "871362719292.dkr.ecr.ap-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "ap-east-2": { - "gpu_ecr_uri_1": "975050140332.dkr.ecr.ap-east-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "ap-northeast-1": { - "gpu_ecr_uri_1": "763104351884.dkr.ecr.ap-northeast-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "ap-northeast-2": { - "gpu_ecr_uri_1": "763104351884.dkr.ecr.ap-northeast-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "ap-northeast-3": { - "gpu_ecr_uri_1": "364406365360.dkr.ecr.ap-northeast-3.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "ap-south-1": { - "gpu_ecr_uri_1": "763104351884.dkr.ecr.ap-south-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "ap-south-2": { - "gpu_ecr_uri_1": "772153158452.dkr.ecr.ap-south-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "ap-southeast-1": { - "gpu_ecr_uri_1": "763104351884.dkr.ecr.ap-southeast-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "ap-southeast-2": { - "gpu_ecr_uri_1": "763104351884.dkr.ecr.ap-southeast-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "ap-southeast-3": { - "gpu_ecr_uri_1": "907027046896.dkr.ecr.ap-southeast-3.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "ap-southeast-4": { - "gpu_ecr_uri_1": "457447274322.dkr.ecr.ap-southeast-4.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "ap-southeast-5": { - "gpu_ecr_uri_1": "550225433462.dkr.ecr.ap-southeast-5.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "ap-southeast-7": { - "gpu_ecr_uri_1": "590183813437.dkr.ecr.ap-southeast-7.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "ca-central-1": { - "gpu_ecr_uri_1": "763104351884.dkr.ecr.ca-central-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "ca-west-1": { - "gpu_ecr_uri_1": "204538143572.dkr.ecr.ca-west-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "cn-north-1": { - "gpu_ecr_uri_1": "727897471807.dkr.ecr.cn-north-1.amazonaws.com.cn/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "cn-northwest-1": { - "gpu_ecr_uri_1": "727897471807.dkr.ecr.cn-northwest-1.amazonaws.com.cn/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "eu-central-1": { - "gpu_ecr_uri_1": "763104351884.dkr.ecr.eu-central-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "eu-central-2": { - "gpu_ecr_uri_1": "380420809688.dkr.ecr.eu-central-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "eu-north-1": { - "gpu_ecr_uri_1": "763104351884.dkr.ecr.eu-north-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "eu-south-1": { - "gpu_ecr_uri_1": "692866216735.dkr.ecr.eu-south-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "eu-south-2": { - "gpu_ecr_uri_1": "503227376785.dkr.ecr.eu-south-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "eu-west-1": { - "gpu_ecr_uri_1": "763104351884.dkr.ecr.eu-west-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "eu-west-2": { - "gpu_ecr_uri_1": "763104351884.dkr.ecr.eu-west-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "eu-west-3": { - "gpu_ecr_uri_1": "763104351884.dkr.ecr.eu-west-3.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "il-central-1": { - "gpu_ecr_uri_1": "780543022126.dkr.ecr.il-central-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "me-central-1": { - "gpu_ecr_uri_1": "914824155844.dkr.ecr.me-central-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "me-south-1": { - "gpu_ecr_uri_1": "217643126080.dkr.ecr.me-south-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "mx-central-1": { - "gpu_ecr_uri_1": "637423239942.dkr.ecr.mx-central-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "sa-east-1": { - "gpu_ecr_uri_1": "763104351884.dkr.ecr.sa-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "us-east-1": { - "gpu_ecr_uri_1": "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "us-east-2": { - "gpu_ecr_uri_1": "763104351884.dkr.ecr.us-east-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "us-gov-east-1": { - "gpu_ecr_uri_1": "446045086412.dkr.ecr.us-gov-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "us-gov-west-1": { - "gpu_ecr_uri_1": "442386744353.dkr.ecr.us-gov-west-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "us-west-1": { - "gpu_ecr_uri_1": "763104351884.dkr.ecr.us-west-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - }, - "us-west-2": { - "gpu_ecr_uri_1": "763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.1-tgi2.0.0-gpu-py310-cu121-ubuntu22.04" - } - }, - "variants": { - "g4dn": { - "regional_properties": { - "image_uri": "$gpu_ecr_uri_1" - } - }, - "g5": { - "regional_properties": { - "image_uri": "$gpu_ecr_uri_1" - } - }, - "g6": { - "regional_properties": { - "image_uri": "$gpu_ecr_uri_1" - } - }, - "g6e": { - "regional_properties": { - "image_uri": "$gpu_ecr_uri_1" - } - }, - "local_gpu": { - "regional_properties": { - "image_uri": "$gpu_ecr_uri_1" - } - }, - "p2": { - "regional_properties": { - "image_uri": "$gpu_ecr_uri_1" - } - }, - "p3": { - "regional_properties": { - "image_uri": "$gpu_ecr_uri_1" - } - }, - "p3dn": { - "regional_properties": { - "image_uri": "$gpu_ecr_uri_1" - } - }, - "p4d": { - "regional_properties": { - "image_uri": "$gpu_ecr_uri_1" - } - }, - "p4de": { - "regional_properties": { - "image_uri": "$gpu_ecr_uri_1" - } - }, - "p5": { - "regional_properties": { - "image_uri": "$gpu_ecr_uri_1" - } - }, - "p5e": { - "regional_properties": { - "image_uri": "$gpu_ecr_uri_1" - } - }, - "p5en": { - "regional_properties": { - "image_uri": "$gpu_ecr_uri_1" - } - }, - "p6": { - "regional_properties": { - "image_uri": "$gpu_ecr_uri_1" - } - }, - "p6e": { - "regional_properties": { - "image_uri": "$gpu_ecr_uri_1" - } - }, - "ml.g5.12xlarge": { - "properties": { - "environment_variables": { - "SM_NUM_GPUS": "4", - "MAX_BATCH_PREFILL_TOKENS": "16384" - }, - "resource_requirements": { - "min_memory_mb": 98304, - "num_accelerators": 4 - } - } - }, - "ml.g5.24xlarge": { - "properties": { - "environment_variables": { - "SM_NUM_GPUS": "4" - }, - "resource_requirements": { - "min_memory_mb": 196608, - "num_accelerators": 4 - } - } - }, - "ml.g5.48xlarge": { - "properties": { - "environment_variables": { - "SM_NUM_GPUS": "8" - }, - "resource_requirements": { - "min_memory_mb": 393216, - "num_accelerators": 8 - } - } - }, - "ml.p4d.24xlarge": { - "properties": { - "environment_variables": { - "SM_NUM_GPUS": "8", - "MAX_BATCH_PREFILL_TOKENS": "16384" - }, - "resource_requirements": { - "min_memory_mb": 589824, - "num_accelerators": 8 - } - } - }, - "ml.p5.48xlarge": { - "properties": { - "environment_variables": { - "OPTION_GPU_MEMORY_UTILIZATION": "0.95" - } - } - }, - "ml.g5.2xlarge": { - "properties": { - "resource_requirements": { - "min_memory_mb": 16384, - "num_accelerators": 1 - } - } - }, - "ml.g5.4xlarge": { - "properties": { - "resource_requirements": { - "min_memory_mb": 32768, - "num_accelerators": 1 - } - } - }, - "ml.g5.8xlarge": { - "properties": { - "resource_requirements": { - "min_memory_mb": 65536, - "num_accelerators": 1 - } - } - } - } - }, - "inference_volume_size": 256, - "inference_enable_network_isolation": true, - "hosting_resource_requirements": { - "min_memory_mb": 98304, - "num_accelerators": 4 - }, - "inference_environment_variables": [ - { - "name": "SAGEMAKER_PROGRAM", - "type": "text", - "default": "inference.py", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "SAGEMAKER_SUBMIT_DIRECTORY", - "type": "text", - "default": "/opt/ml/model/code", - "scope": "container", - "required_for_model_class": false - }, - { - "name": "SAGEMAKER_CONTAINER_LOG_LEVEL", - "type": "text", - "default": "20", - "scope": "container", - "required_for_model_class": false - }, - { - "name": "SAGEMAKER_MODEL_SERVER_TIMEOUT", - "type": "text", - "default": "3600", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "ENDPOINT_SERVER_TIMEOUT", - "type": "int", - "default": 3600, - "scope": "container", - "required_for_model_class": true - }, - { - "name": "MODEL_CACHE_ROOT", - "type": "text", - "default": "/opt/ml/model", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "SAGEMAKER_ENV", - "type": "text", - "default": "1", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "HF_MODEL_ID", - "type": "text", - "default": "/opt/ml/model", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "OPTION_GPU_MEMORY_UTILIZATION", - "type": "text", - "default": "0.85", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "SM_NUM_GPUS", - "type": "text", - "default": "1", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "MAX_INPUT_LENGTH", - "type": "text", - "default": "4095", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "MAX_TOTAL_TOKENS", - "type": "text", - "default": "4096", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "MAX_BATCH_PREFILL_TOKENS", - "type": "text", - "default": "8192", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "MAX_CONCURRENT_REQUESTS", - "type": "text", - "default": "512", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "SAGEMAKER_MODEL_SERVER_WORKERS", - "type": "int", - "default": 1, - "scope": "container", - "required_for_model_class": true - } - ], - "default_payloads": { - "pingExponentialBackoff": { - "content_type": "application/json", - "prompt_key": "inputs", - "output_keys": { - "generated_text": "[0].generated_text", - "input_logprobs": "[0].details.prefill[*].logprob" - }, - "body": { - "inputs": "import socket\n\ndef ping_exponential_backoff(host: str):", - "parameters": { - "max_new_tokens": 256, - "top_p": 0.9, - "temperature": 0.2, - "decoder_input_details": true, - "details": true - } - } - }, - "argparse": { - "content_type": "application/json", - "prompt_key": "inputs", - "output_keys": { - "generated_text": "[0].generated_text" - }, - "body": { - "inputs": "import argparse\n\ndef main(string: str):\n print(string)\n print(string[::-1])\n\nif __name__ == \"__main__\":", - "parameters": { - "max_new_tokens": 256, - "top_p": 0.9, - "temperature": 0.05 - } - } - }, - "Fibonacci": { - "content_type": "application/json", - "prompt_key": "inputs", - "output_keys": { - "generated_text": "[0].generated_text", - "input_logprobs": "[0].details.prefill[*].logprob" - }, - "body": { - "inputs": "def fib(n):\n", - "parameters": { - "max_new_tokens": 64, - "top_p": 0.9, - "temperature": 0.2, - "decoder_input_details": true, - "details": true - } - } - }, - "removeNonAscii": { - "content_type": "application/json", - "prompt_key": "inputs", - "output_keys": { - "generated_text": "[0].generated_text", - "input_logprobs": "[0].details.prefill[*].logprob" - }, - "body": { - "inputs": "def remove_non_ascii(s: str) -> str:\n \"\"\"\n return result\n", - "parameters": { - "max_new_tokens": 256, - "top_p": 0.9, - "temperature": 0.05, - "decoder_input_details": true, - "details": true - } - } - }, - "installationInstructions": { - "content_type": "application/json", - "prompt_key": "inputs", - "output_keys": { - "generated_text": "[0].generated_text" - }, - "body": { - "inputs": "# Installation instructions:\n ```bash\n\n ```\nThis downloads the LLaMA inference code and installs the repository as a local pip package.\n", - "parameters": { - "max_new_tokens": 256, - "top_p": 0.9, - "temperature": 0.05 - } - } - }, - "interfaceManager": { - "content_type": "application/json", - "prompt_key": "inputs", - "output_keys": { - "generated_text": "[0].generated_text" - }, - "body": { - "inputs": "class InterfaceManagerFactory(AbstractManagerFactory):\n def __init__(\ndef main():\n factory = InterfaceManagerFactory(start=datetime.now())\n managers = []\n for i in range(10):\n managers.append(factory.build(id=i))\n", - "parameters": { - "max_new_tokens": 256, - "top_p": 0.9, - "temperature": 0.05 - } - } - }, - "quasiPrefunctoid": { - "content_type": "application/json", - "prompt_key": "inputs", - "output_keys": { - "generated_text": "[0].generated_text" - }, - "body": { - "inputs": "/-- A quasi-prefunctoid is 1-connected iff all its etalisations are 1-connected. -/\ntheorem connected_iff_etalisation [C D : precategoroid] (P : quasi_prefunctoid C D) :\n π₁ P = 0 ↔ = 0 :=\nbegin\n split,\n { intros h f,\n rw pi_1_etalisation at h,\n simp [h],\n refl\n },\n { intro h,\n have := @quasi_adjoint C D P,\n simp [←pi_1_etalisation, this, h],\n refl\n }\nend\n", - "parameters": { - "max_new_tokens": 256, - "top_p": 0.9, - "temperature": 0.05 - } - } - }, - "bashListTextFiles": { - "content_type": "application/json", - "prompt_key": "inputs", - "output_keys": { - "generated_text": "[0].generated_text", - "input_logprobs": "[0].details.prefill[*].logprob" - }, - "body": { - "inputs": "[INST] In Bash, how do I list all text files in the current directory (excluding subdirectories) that have been modified in the last month? [/INST] ", - "parameters": { - "max_new_tokens": 256, - "top_p": 0.9, - "temperature": 0.05, - "decoder_input_details": true, - "details": true - } - } - }, - "inorderPreorderTraversal": { - "content_type": "application/json", - "prompt_key": "inputs", - "output_keys": { - "generated_text": "[0].generated_text" - }, - "body": { - "inputs": "[INST] What is the difference between inorder and preorder traversal? Give an example in Python. [/INST] ", - "parameters": { - "max_new_tokens": 256, - "top_p": 0.9, - "temperature": 0.05 - } - } - }, - "contiguousSublists": { - "content_type": "application/json", - "prompt_key": "inputs", - "output_keys": { - "generated_text": "[0].generated_text" - }, - "body": { - "inputs": "[INST] <>\nProvide answers in JavaScript\n<>\n\nWrite a function that computes the set of sums of all contiguous sublists of a given list. [/INST] ", - "parameters": { - "max_new_tokens": 256, - "top_p": 0.9, - "temperature": 0.05 - } - } - } - } - }, - "lmi": { - "hosting_ecr_specs": { - "framework": "djl-deepspeed", - "framework_version": "0.27.0", - "py_version": "py310" - }, - "hosting_script_key": "source-directory-tarballs/meta/inference/textgeneration/v1.2.3/sourcedir.tar.gz", - "hosting_use_script_uri": false, - "inference_dependencies": [], - "inference_vulnerable": false, - "inference_vulnerabilities": [], - "hosting_artifact_key": "meta-textgeneration/meta-textgeneration-llama-2-7b-f/artifacts/inference/v1.1.0/", - "hosting_prepacked_artifact_version": "1.1.0", - "hosting_prepacked_artifact_key": "meta-textgeneration/meta-textgeneration-llama-2-7b-f/artifacts/inference-prepack/v1.1.0/", - "hosting_artifact_s3_data_type": "S3Prefix", - "hosting_artifact_compression_type": "None", - "hosting_neuron_model_id": "meta-textgenerationneuron-llama-2-7b-f", - "hosting_neuron_model_version": "1.0.0", - "model_kwargs": {}, - "deploy_kwargs": { - "model_data_download_timeout": 1200, - "container_startup_health_check_timeout": 1200 - }, - "predictor_specs": { - "supported_content_types": [ - "application/json" - ], - "supported_accept_types": [ - "application/json" - ], - "default_content_type": "application/json", - "default_accept_type": "application/json" - }, - "default_inference_instance_type": "ml.g5.12xlarge", - "supported_inference_instance_types": [ - "ml.g5.12xlarge", - "ml.g5.24xlarge", - "ml.g5.2xlarge", - "ml.g5.48xlarge", - "ml.g5.4xlarge", - "ml.g5.8xlarge", - "ml.g6.12xlarge", - "ml.p4d.24xlarge" - ], - "hosting_instance_type_variants": { - "regional_aliases": { - "af-south-1": { - "alias_ecr_uri_1": "626614931356.dkr.ecr.af-south-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" - }, - "ap-east-1": { - "alias_ecr_uri_1": "871362719292.dkr.ecr.ap-east-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" - }, - "ap-northeast-1": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.ap-northeast-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" - }, - "ap-northeast-2": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.ap-northeast-2.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" - }, - "ap-northeast-3": { - "alias_ecr_uri_1": "364406365360.dkr.ecr.ap-northeast-3.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" - }, - "ap-south-1": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.ap-south-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" - }, - "ap-southeast-1": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.ap-southeast-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" - }, - "ap-southeast-2": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.ap-southeast-2.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" - }, - "ap-southeast-3": { - "alias_ecr_uri_1": "907027046896.dkr.ecr.ap-southeast-3.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" - }, - "ca-central-1": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.ca-central-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" - }, - "ca-west-1": { - "alias_ecr_uri_1": "204538143572.dkr.ecr.ca-west-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" - }, - "cn-north-1": { - "alias_ecr_uri_1": "727897471807.dkr.ecr.cn-north-1.amazonaws.com.cn/djl-inference:0.27.0-deepspeed0.12.6-cu121" - }, - "cn-northwest-1": { - "alias_ecr_uri_1": "727897471807.dkr.ecr.cn-northwest-1.amazonaws.com.cn/djl-inference:0.27.0-deepspeed0.12.6-cu121" - }, - "eu-central-1": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.eu-central-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" - }, - "eu-north-1": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.eu-north-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" - }, - "eu-south-1": { - "alias_ecr_uri_1": "692866216735.dkr.ecr.eu-south-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" - }, - "eu-west-1": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.eu-west-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" - }, - "eu-west-2": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.eu-west-2.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" - }, - "eu-west-3": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.eu-west-3.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" - }, - "il-central-1": { - "alias_ecr_uri_1": "780543022126.dkr.ecr.il-central-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" - }, - "me-south-1": { - "alias_ecr_uri_1": "217643126080.dkr.ecr.me-south-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" - }, - "sa-east-1": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.sa-east-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" - }, - "us-east-1": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" - }, - "us-east-2": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.us-east-2.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" - }, - "us-gov-east-1": { - "alias_ecr_uri_1": "446045086412.dkr.ecr.us-gov-east-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" - }, - "us-gov-west-1": { - "alias_ecr_uri_1": "442386744353.dkr.ecr.us-gov-west-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" - }, - "us-west-1": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.us-west-1.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" - }, - "us-west-2": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.27.0-deepspeed0.12.6-cu121" - } - }, - "variants": { - "g4dn": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "g5": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "g6": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "g6e": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "local_gpu": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "p2": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "p3": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "p3dn": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "p4d": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "p4de": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "p5": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "p5e": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "p5en": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "p6": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "p6e": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "ml.p4d.24xlarge": { - "properties": { - "environment_variables": { - "OPTION_TENSOR_PARALLEL_DEGREE": "1" - }, - "resource_requirements": { - "min_memory_mb": 589824, - "num_accelerators": 8 - } - } - }, - "ml.p5.48xlarge": { - "properties": { - "environment_variables": { - "OPTION_TENSOR_PARALLEL_DEGREE": "1", - "OPTION_GPU_MEMORY_UTILIZATION": "0.95" - } - } - }, - "ml.g5.2xlarge": { - "properties": { - "resource_requirements": { - "min_memory_mb": 16384, - "num_accelerators": 1 - } - } - }, - "ml.g5.4xlarge": { - "properties": { - "resource_requirements": { - "min_memory_mb": 32768, - "num_accelerators": 1 - } - } - }, - "ml.g5.8xlarge": { - "properties": { - "resource_requirements": { - "min_memory_mb": 65536, - "num_accelerators": 1 - } - } - }, - "ml.g5.12xlarge": { - "properties": { - "resource_requirements": { - "min_memory_mb": 98304, - "num_accelerators": 4 - } - } - }, - "ml.g5.24xlarge": { - "properties": { - "resource_requirements": { - "min_memory_mb": 196608, - "num_accelerators": 4 - } - } - }, - "ml.g5.48xlarge": { - "properties": { - "resource_requirements": { - "min_memory_mb": 393216, - "num_accelerators": 8 - } - } - } - } - }, - "inference_volume_size": 256, - "inference_enable_network_isolation": true, - "hosting_resource_requirements": { - "min_memory_mb": 98304, - "num_accelerators": 4 - }, - "inference_environment_variables": [ - { - "name": "SAGEMAKER_PROGRAM", - "type": "text", - "default": "inference.py", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "SAGEMAKER_SUBMIT_DIRECTORY", - "type": "text", - "default": "/opt/ml/model/code", - "scope": "container", - "required_for_model_class": false - }, - { - "name": "SAGEMAKER_CONTAINER_LOG_LEVEL", - "type": "text", - "default": "20", - "scope": "container", - "required_for_model_class": false - }, - { - "name": "SAGEMAKER_MODEL_SERVER_TIMEOUT", - "type": "text", - "default": "3600", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "ENDPOINT_SERVER_TIMEOUT", - "type": "int", - "default": 3600, - "scope": "container", - "required_for_model_class": true - }, - { - "name": "MODEL_CACHE_ROOT", - "type": "text", - "default": "/opt/ml/model", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "SAGEMAKER_ENV", - "type": "text", - "default": "1", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "HF_MODEL_ID", - "type": "text", - "default": "/opt/ml/model", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "OPTION_GPU_MEMORY_UTILIZATION", - "type": "text", - "default": "0.85", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "SAGEMAKER_MODEL_SERVER_WORKERS", - "type": "int", - "default": 1, - "scope": "container", - "required_for_model_class": true - } - ], - "default_payloads": { - "meaningOfLife": { - "content_type": "application/json", - "prompt_key": "inputs", - "output_keys": { - "generated_text": "generated_text" - }, - "body": { - "inputs": "I believe the meaning of life is", - "parameters": { - "max_new_tokens": 64, - "top_p": 0.9, - "temperature": 0.6, - "decoder_input_details": true, - "details": true - } - } - }, - "theoryOfRelativity": { - "content_type": "application/json", - "prompt_key": "inputs", - "output_keys": { - "generated_text": "generated_text" - }, - "body": { - "inputs": "Simply put, the theory of relativity states that ", - "parameters": { - "max_new_tokens": 64, - "top_p": 0.9, - "temperature": 0.6 - } - } - }, - "teamMessage": { - "content_type": "application/json", - "prompt_key": "inputs", - "output_keys": { - "generated_text": "generated_text" - }, - "body": { - "inputs": "A brief message congratulating the team on the launch:\n\nHi everyone,\n\nI just ", - "parameters": { - "max_new_tokens": 64, - "top_p": 0.9, - "temperature": 0.6 - } - } - }, - "englishToFrench": { - "content_type": "application/json", - "prompt_key": "inputs", - "output_keys": { - "generated_text": "generated_text" - }, - "body": { - "inputs": "Translate English to French:\nsea otter => loutre de mer\npeppermint => menthe poivrée\nplush girafe => girafe peluche\ncheese =>", - "parameters": { - "max_new_tokens": 64, - "top_p": 0.9, - "temperature": 0.6 - } - } - } - } - }, - "lmi-optimized": { - "hosting_ecr_specs": { - "framework": "djl-lmi", - "framework_version": "0.28.0", - "py_version": "py310" - }, - "hosting_script_key": "source-directory-tarballs/meta/inference/textgeneration/v1.2.3/sourcedir.tar.gz", - "hosting_use_script_uri": false, - "inference_dependencies": [], - "inference_vulnerable": false, - "inference_vulnerabilities": [], - "hosting_artifact_key": "meta-textgeneration/meta-textgeneration-llama-2-7b-f/artifacts/inference/v1.1.0/", - "hosting_prepacked_artifact_version": "1.1.0", - "hosting_prepacked_artifact_key": "meta-textgeneration/meta-textgeneration-llama-2-7b-f/artifacts/inference-prepack/v1.1.0/", - "hosting_artifact_s3_data_type": "S3Prefix", - "hosting_artifact_compression_type": "None", - "hosting_additional_data_sources": { - "speculative_decoding": [ - { - "channel_name": "draft_model", - "artifact_version": "v2", - "s3_data_source": { - "compression_type": "None", - "s3_data_type": "S3Prefix", - "s3_uri": "sagemaker-speculative-decoding-llama2-tiny-v2/" - } - } - ] - }, - "hosting_neuron_model_id": "meta-textgenerationneuron-llama-2-7b-f", - "hosting_neuron_model_version": "1.0.0", - "model_kwargs": {}, - "deploy_kwargs": { - "model_data_download_timeout": 1200, - "container_startup_health_check_timeout": 1200 - }, - "predictor_specs": { - "supported_content_types": [ - "application/json" - ], - "supported_accept_types": [ - "application/json" - ], - "default_content_type": "application/json", - "default_accept_type": "application/json" - }, - "default_inference_instance_type": "ml.p4d.24xlarge", - "supported_inference_instance_types": [ - "ml.g5.12xlarge", - "ml.g5.2xlarge", - "ml.g6.12xlarge", - "ml.g6.2xlarge", - "ml.p4d.24xlarge", - "ml.p4de.24xlarge", - "ml.p5.48xlarge" - ], - "hosting_instance_type_variants": { - "regional_aliases": { - "af-south-1": { - "alias_ecr_uri_1": "626614931356.dkr.ecr.af-south-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" - }, - "ap-east-1": { - "alias_ecr_uri_1": "871362719292.dkr.ecr.ap-east-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" - }, - "ap-northeast-1": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.ap-northeast-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" - }, - "ap-northeast-2": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.ap-northeast-2.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" - }, - "ap-northeast-3": { - "alias_ecr_uri_1": "364406365360.dkr.ecr.ap-northeast-3.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" - }, - "ap-south-1": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.ap-south-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" - }, - "ap-southeast-1": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.ap-southeast-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" - }, - "ap-southeast-2": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.ap-southeast-2.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" - }, - "ap-southeast-3": { - "alias_ecr_uri_1": "907027046896.dkr.ecr.ap-southeast-3.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" - }, - "ca-central-1": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.ca-central-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" - }, - "ca-west-1": { - "alias_ecr_uri_1": "204538143572.dkr.ecr.ca-west-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" - }, - "cn-north-1": { - "alias_ecr_uri_1": "727897471807.dkr.ecr.cn-north-1.amazonaws.com.cn/djl-inference:0.28.0-lmi10.0.0-cu124" - }, - "cn-northwest-1": { - "alias_ecr_uri_1": "727897471807.dkr.ecr.cn-northwest-1.amazonaws.com.cn/djl-inference:0.28.0-lmi10.0.0-cu124" - }, - "eu-central-1": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.eu-central-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" - }, - "eu-north-1": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.eu-north-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" - }, - "eu-south-1": { - "alias_ecr_uri_1": "692866216735.dkr.ecr.eu-south-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" - }, - "eu-west-1": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.eu-west-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" - }, - "eu-west-2": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.eu-west-2.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" - }, - "eu-west-3": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.eu-west-3.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" - }, - "il-central-1": { - "alias_ecr_uri_1": "780543022126.dkr.ecr.il-central-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" - }, - "me-central-1": { - "alias_ecr_uri_1": "914824155844.dkr.ecr.me-central-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" - }, - "me-south-1": { - "alias_ecr_uri_1": "217643126080.dkr.ecr.me-south-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" - }, - "sa-east-1": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.sa-east-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" - }, - "us-east-1": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" - }, - "us-east-2": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.us-east-2.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" - }, - "us-gov-east-1": { - "alias_ecr_uri_1": "446045086412.dkr.ecr.us-gov-east-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" - }, - "us-gov-west-1": { - "alias_ecr_uri_1": "442386744353.dkr.ecr.us-gov-west-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" - }, - "us-west-1": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.us-west-1.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" - }, - "us-west-2": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.28.0-lmi10.0.0-cu124" - } - }, - "variants": { - "g4dn": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "g5": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "g6": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "g6e": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "local_gpu": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "p2": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "p3": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "p3dn": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "p4d": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "p4de": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "p5": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "p5e": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "p5en": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "p6": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "p6e": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "ml.p4d.24xlarge": { - "properties": { - "environment_variables": { - "OPTION_TENSOR_PARALLEL_DEGREE": "1" - }, - "resource_requirements": { - "min_memory_mb": 589824, - "num_accelerators": 8 - } - } - }, - "ml.p5.48xlarge": { - "properties": { - "environment_variables": { - "OPTION_TENSOR_PARALLEL_DEGREE": "1", - "OPTION_GPU_MEMORY_UTILIZATION": "0.95" - } - } - }, - "ml.p4de.24xlarge": { - "properties": { - "resource_requirements": { - "min_memory_mb": 589824, - "num_accelerators": 8 - } - } - } - } - }, - "inference_volume_size": 256, - "inference_enable_network_isolation": true, - "hosting_resource_requirements": { - "min_memory_mb": 589824, - "num_accelerators": 8 - }, - "inference_environment_variables": [ - { - "name": "SAGEMAKER_PROGRAM", - "type": "text", - "default": "inference.py", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "SAGEMAKER_SUBMIT_DIRECTORY", - "type": "text", - "default": "/opt/ml/model/code", - "scope": "container", - "required_for_model_class": false - }, - { - "name": "SAGEMAKER_CONTAINER_LOG_LEVEL", - "type": "text", - "default": "20", - "scope": "container", - "required_for_model_class": false - }, - { - "name": "SAGEMAKER_MODEL_SERVER_TIMEOUT", - "type": "text", - "default": "3600", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "ENDPOINT_SERVER_TIMEOUT", - "type": "int", - "default": 3600, - "scope": "container", - "required_for_model_class": true - }, - { - "name": "MODEL_CACHE_ROOT", - "type": "text", - "default": "/opt/ml/model", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "SAGEMAKER_ENV", - "type": "text", - "default": "1", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "HF_MODEL_ID", - "type": "text", - "default": "/opt/ml/model", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "OPTION_SPECULATIVE_DRAFT_MODEL", - "type": "text", - "default": "/opt/ml/additional-model-data-sources/draft_model", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "OPTION_GPU_MEMORY_UTILIZATION", - "type": "text", - "default": "0.85", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "SAGEMAKER_MODEL_SERVER_WORKERS", - "type": "int", - "default": 1, - "scope": "container", - "required_for_model_class": true - } - ], - "default_payloads": { - "meaningOfLife": { - "content_type": "application/json", - "prompt_key": "inputs", - "output_keys": { - "generated_text": "generated_text" - }, - "body": { - "inputs": "I believe the meaning of life is", - "parameters": { - "max_new_tokens": 64, - "top_p": 0.9, - "temperature": 0.6 - } - } - }, - "theoryOfRelativity": { - "content_type": "application/json", - "prompt_key": "inputs", - "output_keys": { - "generated_text": "generated_text" - }, - "body": { - "inputs": "Simply put, the theory of relativity states that ", - "parameters": { - "max_new_tokens": 64, - "top_p": 0.9, - "temperature": 0.6 - } - } - }, - "teamMessage": { - "content_type": "application/json", - "prompt_key": "inputs", - "output_keys": { - "generated_text": "generated_text" - }, - "body": { - "inputs": "A brief message congratulating the team on the launch:\n\nHi everyone,\n\nI just ", - "parameters": { - "max_new_tokens": 64, - "top_p": 0.9, - "temperature": 0.6 - } - } - }, - "englishToFrench": { - "content_type": "application/json", - "prompt_key": "inputs", - "output_keys": { - "generated_text": "generated_text" - }, - "body": { - "inputs": "Translate English to French:\nsea otter => loutre de mer\npeppermint => menthe poivrée\nplush girafe => girafe peluche\ncheese =>", - "parameters": { - "max_new_tokens": 64, - "top_p": 0.9, - "temperature": 0.6 - } - } - } - } - }, - "neuron": { - "hosting_ecr_specs": { - "framework": "djl-neuronx", - "framework_version": "0.24.0", - "py_version": "py39" - }, - "hosting_script_key": "source-directory-tarballs/meta/inference/textgenerationneuron/v1.0.0/sourcedir.tar.gz", - "hosting_use_script_uri": false, - "inference_dependencies": [ - "sagemaker_jumpstart_huggingface_script_utilities==1.0.8", - "sagemaker_jumpstart_script_utilities==1.1.8" - ], - "inference_vulnerable": false, - "inference_vulnerabilities": [], - "hosting_artifact_key": "meta-textgeneration/meta-textgeneration-llama-2-7b-f/artifacts/neuron/inference/v1.0.0/", - "hosting_prepacked_artifact_version": "1.0.0", - "hosting_prepacked_artifact_key": "meta-textgeneration/meta-textgeneration-llama-2-7b-f/artifacts/neuron/inference-prepack/v1.0.0/", - "hosting_artifact_s3_data_type": "S3Prefix", - "hosting_artifact_compression_type": "None", - "hosting_neuron_model_id": "meta-textgeneration-llama-2-7b-f", - "hosting_neuron_model_version": "1.0.0", - "model_kwargs": {}, - "deploy_kwargs": { - "model_data_download_timeout": 3600, - "container_startup_health_check_timeout": 3600 - }, - "predictor_specs": { - "supported_content_types": [ - "application/json" - ], - "supported_accept_types": [ - "application/json" - ], - "default_content_type": "application/json", - "default_accept_type": "application/json" - }, - "default_inference_instance_type": "ml.inf2.xlarge", - "supported_inference_instance_types": [ - "ml.inf2.xlarge", - "ml.inf2.8xlarge", - "ml.inf2.24xlarge", - "ml.inf2.48xlarge" - ], - "hosting_instance_type_variants": { - "regional_aliases": { - "ap-northeast-1": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.ap-northeast-1.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1" - }, - "ap-south-1": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.ap-south-1.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1" - }, - "ap-southeast-1": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.ap-southeast-1.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1" - }, - "ap-southeast-2": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.ap-southeast-2.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1" - }, - "ap-southeast-5": { - "alias_ecr_uri_1": "550225433462.dkr.ecr.ap-southeast-5.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1" - }, - "ap-southeast-7": { - "alias_ecr_uri_1": "590183813437.dkr.ecr.ap-southeast-7.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1" - }, - "ca-west-1": { - "alias_ecr_uri_1": "204538143572.dkr.ecr.ca-west-1.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1" - }, - "eu-central-1": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.eu-central-1.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1" - }, - "eu-central-2": { - "alias_ecr_uri_1": "380420809688.dkr.ecr.eu-central-2.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1" - }, - "eu-west-1": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.eu-west-1.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1" - }, - "eu-west-3": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.eu-west-3.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1" - }, - "mx-central-1": { - "alias_ecr_uri_1": "637423239942.dkr.ecr.mx-central-1.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1" - }, - "sa-east-1": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.sa-east-1.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1" - }, - "us-east-1": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1" - }, - "us-east-2": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.us-east-2.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1" - }, - "us-west-2": { - "alias_ecr_uri_1": "763104351884.dkr.ecr.us-west-2.amazonaws.com/djl-inference:0.24.0-neuronx-sdk2.14.1" - } - }, - "variants": { - "inf2": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "trn1": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "trn1n": { - "regional_properties": { - "image_uri": "$alias_ecr_uri_1" - } - }, - "ml.inf2.xlarge": { - "properties": { - "environment_variables": { - "OPTION_TENSOR_PARALLEL_DEGREE": "2", - "OPTION_N_POSITIONS": "1024", - "OPTION_DTYPE": "fp16", - "OPTION_ROLLING_BATCH": "auto", - "OPTION_MAX_ROLLING_BATCH_SIZE": "1", - "OPTION_NEURON_OPTIMIZE_LEVEL": "2" - }, - "resource_requirements": { - "min_memory_mb": 8192, - "num_accelerators": 1 - } - } - }, - "ml.inf2.8xlarge": { - "properties": { - "environment_variables": { - "OPTION_TENSOR_PARALLEL_DEGREE": "2", - "OPTION_N_POSITIONS": "2048", - "OPTION_DTYPE": "fp16", - "OPTION_ROLLING_BATCH": "auto", - "OPTION_MAX_ROLLING_BATCH_SIZE": "4", - "OPTION_NEURON_OPTIMIZE_LEVEL": "2" - }, - "resource_requirements": { - "min_memory_mb": 65536, - "num_accelerators": 1 - } - } - }, - "ml.inf2.24xlarge": { - "properties": { - "environment_variables": { - "OPTION_TENSOR_PARALLEL_DEGREE": "12", - "OPTION_N_POSITIONS": "4096", - "OPTION_DTYPE": "fp16", - "OPTION_ROLLING_BATCH": "auto", - "OPTION_MAX_ROLLING_BATCH_SIZE": "4", - "OPTION_NEURON_OPTIMIZE_LEVEL": "2" - }, - "resource_requirements": { - "min_memory_mb": 196608, - "num_accelerators": 6 - } - } - }, - "ml.inf2.48xlarge": { - "properties": { - "environment_variables": { - "OPTION_TENSOR_PARALLEL_DEGREE": "24", - "OPTION_N_POSITIONS": "4096", - "OPTION_DTYPE": "fp16", - "OPTION_ROLLING_BATCH": "auto", - "OPTION_MAX_ROLLING_BATCH_SIZE": "4", - "OPTION_NEURON_OPTIMIZE_LEVEL": "2" - }, - "resource_requirements": { - "min_memory_mb": 393216, - "num_accelerators": 12 - } - } - } - } - }, - "inference_volume_size": 256, - "inference_enable_network_isolation": false, - "hosting_resource_requirements": { - "min_memory_mb": 8192, - "num_accelerators": 1 - }, - "inference_environment_variables": [ - { - "name": "SAGEMAKER_PROGRAM", - "type": "text", - "default": "inference.py", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "SAGEMAKER_SUBMIT_DIRECTORY", - "type": "text", - "default": "/opt/ml/model/code", - "scope": "container", - "required_for_model_class": false - }, - { - "name": "SAGEMAKER_CONTAINER_LOG_LEVEL", - "type": "text", - "default": "20", - "scope": "container", - "required_for_model_class": false - }, - { - "name": "SAGEMAKER_MODEL_SERVER_TIMEOUT", - "type": "text", - "default": "3600", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "ENDPOINT_SERVER_TIMEOUT", - "type": "int", - "default": 3600, - "scope": "container", - "required_for_model_class": true - }, - { - "name": "MODEL_CACHE_ROOT", - "type": "text", - "default": "/opt/ml/model", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "SAGEMAKER_ENV", - "type": "text", - "default": "1", - "scope": "container", - "required_for_model_class": true - }, - { - "name": "SAGEMAKER_MODEL_SERVER_WORKERS", - "type": "int", - "default": 1, - "scope": "container", - "required_for_model_class": true - } - ], - "default_payloads": { - "mayonnaise": { - "content_type": "application/json", - "prompt_key": "inputs", - "output_keys": { - "generated_text": "generated_text" - }, - "body": { - "inputs": "[INST] what is the recipe of mayonnaise? [/INST] ", - "parameters": { - "max_new_tokens": 256, - "top_p": 0.9, - "temperature": 0.6 - } - } - }, - "parisTrip": { - "content_type": "application/json", - "prompt_key": "inputs", - "output_keys": { - "generated_text": "generated_text" - }, - "body": { - "inputs": "[INST] I am going to Paris, what should I see? [/INST] Paris, the capital of France, is known for its stunning architecture, art museums, historical landmarks, and romantic atmosphere. Here are some of the top attractions to see in Paris:\n\n1. The Eiffel Tower: The iconic Eiffel Tower is one of the most recognizable landmarks in the world and offers breathtaking views of the city.\n2. The Louvre Museum: The Louvre is one of the world's largest and most famous museums, housing an impressive collection of art and artifacts, including the Mona Lisa.\n3. Notre-Dame Cathedral: This beautiful cathedral is one of the most famous landmarks in Paris and is known for its Gothic architecture and stunning stained glass windows.\n\nThese are just a few of the many attractions that Paris has to offer. With so much to see and do, it's no wonder that Paris is one of the most popular tourist destinations in the world.[INST] What is so great about #1? [/INST] ", - "parameters": { - "max_new_tokens": 256, - "top_p": 0.9, - "temperature": 0.6 - } - } - }, - "parisHaiku": { - "content_type": "application/json", - "prompt_key": "inputs", - "output_keys": { - "generated_text": "generated_text" - }, - "body": { - "inputs": "[INST] <>\nAlways answer with Haiku\n<>\n\nI am going to Paris, what should I see? [/INST] ", - "parameters": { - "max_new_tokens": 256, - "top_p": 0.9, - "temperature": 0.6 - } - } - }, - "emojisBeijing": { - "content_type": "application/json", - "prompt_key": "inputs", - "output_keys": { - "generated_text": "generated_text" - }, - "body": { - "inputs": "[INST] <>\nAlways answer with detailed instruction\n<>\n\nHow to go from Beijing to NY? [/INST] ", - "parameters": { - "max_new_tokens": 256, - "top_p": 0.9, - "temperature": 0.6 - } - } - } - } - } - }, - "inference_config_rankings": { - "overall": { - "description": "default", - "rankings": [ - "tgi", - "lmi", - "lmi-optimized", - "neuron" - ] - } - }, - "hosting_neuron_model_id": "meta-textgenerationneuron-llama-2-7b-f", - "hosting_neuron_model_version": "1.0.0" -} \ No newline at end of file diff --git a/test_unified_model_card.py b/test_unified_model_card.py deleted file mode 100644 index 475d46afdb..0000000000 --- a/test_unified_model_card.py +++ /dev/null @@ -1,193 +0,0 @@ -#!/usr/bin/env python3 - -import json -import os -import sys -import boto3 -import time -from datetime import datetime -# from urllib.parse import urlparse -from unittest.mock import patch - -os.environ['HUGGING_FACE_HUB_TOKEN'] = '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' -os.environ['AWS_DEFAULT_REGION'] = 'us-west-2' - -sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src')) - -from sagemaker.jumpstart.model import JumpStartModel -from sagemaker.jumpstart.types import JumpStartModelSpecs -from sagemaker.jumpstart.enums import JumpStartModelType - - -def check_aws_account(): - """Check which AWS account and region we're using.""" - try: - sts_client = boto3.client('sts') - identity = sts_client.get_caller_identity() - - account_id = identity['Account'] - user_arn = identity['Arn'] - region = boto3.Session().region_name or 'us-west-2' - - print(f" AWS Account: {account_id}") - print(f" User/Role: {user_arn}") - print(f" Region: {region}") - print() - - return account_id, region - except Exception as e: - print(f" Error checking AWS account: {e}") - return None, None - - -def monitor_endpoint(endpoint_name, region='us-west-2'): - """Monitor endpoint deployment progress.""" - sagemaker_client = boto3.client('sagemaker', region_name=region) - - print(f" Monitoring endpoint: {endpoint_name}") - start_time = time.time() - - while True: - try: - response = sagemaker_client.describe_endpoint(EndpointName=endpoint_name) - status = response['EndpointStatus'] - elapsed = int(time.time() - start_time) - - print(f" [{elapsed//60}m {elapsed%60}s] {endpoint_name}: {status}") - - if status == 'InService': - print(f" {endpoint_name} is ready! (took {elapsed//60}m {elapsed%60}s)") - break - elif status == 'Failed': - print(f" {endpoint_name} deployment failed!") - print(f"Failure reason: {response.get('FailureReason', 'Unknown')}") - break - - except Exception as e: - print(f"Error checking {endpoint_name}: {e}") - - time.sleep(30) # Check every 30 seconds - -def load_custom_spec(): - """Load the custom spec file from src/sagemaker directory.""" - spec_path = os.path.join(os.path.dirname(__file__), 'specfileex') - with open(spec_path, 'r') as f: - return json.load(f) - - -# Check AWS account -account_id, region = check_aws_account() - -custom_spec = load_custom_spec() -mock_specs = JumpStartModelSpecs(custom_spec) - -with patch('sagemaker.jumpstart.cache.JumpStartModelsCache.get_specs') as mock_get_specs, \ - patch('sagemaker.jumpstart.utils.validate_model_id_and_get_type') as mock_validate_model: - - mock_get_specs.return_value = mock_specs - mock_validate_model.return_value = JumpStartModelType.OPEN_WEIGHTS - - model_id = "meta-textgeneration-llama-2-7b-f" - model_version = "4.19.0" - accept_eula = False - - # Create unique endpoint names with timestamp - timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") - neuron_endpoint_name = f"llama-neuron-{timestamp}" - gpu_endpoint_name = f"llama-gpu-{timestamp}" - - print(f" Neuron endpoint: {neuron_endpoint_name}") - print(f" GPU endpoint: {gpu_endpoint_name}") - print() - - - model_neuron = JumpStartModel( - model_id=model_id, - model_version=model_version, - instance_type="ml.inf2.24xlarge", - env={"HUGGING_FACE_HUB_TOKEN": "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"} - ) - - # Modify to use alpha us-west-2 bucket - original_neuron_uri = model_neuron.model_data['S3DataSource']['S3Uri'] - # Replace with alpha us-west-2 bucket (handle both east-1 and west-2 original buckets) - alpha_neuron_uri = original_neuron_uri.replace('jumpstart-private-cache-prod-us-east-1', 'jumpstart-private-cache-alpha-us-west-2') - alpha_neuron_uri = alpha_neuron_uri.replace('jumpstart-private-cache-prod-us-west-2', 'jumpstart-private-cache-alpha-us-west-2') - # Also handle regular cache buckets (without "private") - alpha_neuron_uri = alpha_neuron_uri.replace('jumpstart-cache-prod-us-east-1', 'jumpstart-cache-alpha-us-west-2') - alpha_neuron_uri = alpha_neuron_uri.replace('jumpstart-cache-prod-us-west-2', 'jumpstart-cache-alpha-us-west-2') - model_neuron.model_data['S3DataSource']['S3Uri'] = alpha_neuron_uri - print(f"Original neuron URI: {original_neuron_uri}") - print(f"Alpha neuron URI: {alpha_neuron_uri}") - print(model_neuron.model_data) - neuron_location = model_neuron.model_data['S3DataSource']['S3Uri'] - print(f"Neuron location: {neuron_location}") - - print("Deploying neuron model...") - neuron_predictor = model_neuron.deploy( - initial_instance_count=1, - instance_type="ml.inf2.24xlarge", - endpoint_name=neuron_endpoint_name, - accept_eula=True, - wait=False - ) - - # Monitor neuron deployment - monitor_endpoint(neuron_endpoint_name, 'us-west-2') - - - - model_gpu = JumpStartModel( - model_id=model_id, - model_version=model_version, - instance_type="ml.g5.12xlarge", - env={"HUGGING_FACE_HUB_TOKEN": "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"} - ) - - # Modify to use alpha us-west-2 bucket - original_gpu_uri = model_gpu.model_data['S3DataSource']['S3Uri'] - # Replace with alpha us-west-2 bucket (handle both east-1 and west-2 original buckets) - alpha_gpu_uri = original_gpu_uri.replace('jumpstart-private-cache-prod-us-east-1', 'jumpstart-private-cache-alpha-us-west-2') - alpha_gpu_uri = alpha_gpu_uri.replace('jumpstart-private-cache-prod-us-west-2', 'jumpstart-private-cache-alpha-us-west-2') - # Also handle regular cache buckets (without "private") - alpha_gpu_uri = alpha_gpu_uri.replace('jumpstart-cache-prod-us-east-1', 'jumpstart-cache-alpha-us-west-2') - alpha_gpu_uri = alpha_gpu_uri.replace('jumpstart-cache-prod-us-west-2', 'jumpstart-cache-alpha-us-west-2') - model_gpu.model_data['S3DataSource']['S3Uri'] = alpha_gpu_uri - print(f"Original GPU URI: {original_gpu_uri}") - print(f"Alpha GPU URI: {alpha_gpu_uri}") - print(model_gpu.model_data) - gpu_location = model_gpu.model_data['S3DataSource']['S3Uri'] - print(f"GPU location: {gpu_location}") - - print("Deploying GPU model...") - gpu_predictor = model_gpu.deploy( - initial_instance_count=1, - instance_type="ml.g5.12xlarge", - endpoint_name=gpu_endpoint_name, - accept_eula=True, - wait=False - ) - - # Monitor GPU deployment - monitor_endpoint(gpu_endpoint_name, 'us-west-2') - - test_payload = { - "inputs": "The meaning of life is", - "parameters": { - "max_new_tokens": 50, - "temperature": 0.7 - } - } - - print("Testing neuron endpoint...") - neuron_response = neuron_predictor.predict(test_payload) - print(f"Neuron response: {neuron_response}") - - print("Testing GPU endpoint...") - gpu_response = gpu_predictor.predict(test_payload) - print(f"GPU response: {gpu_response}") - - - #print("Cleaning up endpoints...") - #neuron_predictor.delete_endpoint() - #gpu_predictor.delete_endpoint()