diff --git a/.backportrc.json b/.backportrc.json index e0862ca21..90b11a29a 100644 --- a/.backportrc.json +++ b/.backportrc.json @@ -1,5 +1,5 @@ { "upstream": "elastic/stack-docs", - "branches": [{ "name": "8.15", "checked": true }, "8.14", "8.13", "8.12", "8.11", "8.10", "8.9", "8.8", "8.7", "8.6", "8.5", "8.4", "8.3", "8.2", "8.1", "8.0", "7.17", "7.16", "7.15", "7.14", "7.13", "7.12", "7.11", "7.10", "7.9", "7.8", "7.7", "7.6", "7.5", "7.4", "7.3", "7.2", "7.1", "7.0", "6.8", "6.7", "6.6", "6.5", "6.4", "6.3", "6.2", "6.1", "6.0", "5.6"], + "branches": [{ "name": "8.x", "checked": true }, "8.16", "8.15", "8.14", "8.13", "8.12", "8.11", "8.10", "8.9", "8.8", "8.7", "8.6", "8.5", "8.4", "8.3", "8.2", "8.1", "8.0", "7.17", "7.16", "7.15", "7.14", "7.13", "7.12", "7.11", "7.10", "7.9", "7.8", "7.7", "7.6", "7.5", "7.4", "7.3", "7.2", "7.1", "7.0", "6.8", "6.7", "6.6", "6.5", "6.4", "6.3", "6.2", "6.1", "6.0", "5.6"], "labels": ["backport"] } diff --git a/.mergify.yml b/.mergify.yml index 82556b463..744f0699d 100644 --- a/.mergify.yml +++ b/.mergify.yml @@ -15,6 +15,34 @@ pull_request_rules: git merge /{{base}} git push {{head}} ``` + - name: backport patches to 8.17 branch + conditions: + - merged + - base=main + - label=backport-8.17 + actions: + backport: + assignees: + - "{{ author }}" + branches: + - "8.x" + title: "[{{ destination_branch }}] {{ title }} (backport #{{ number }})" + labels: + - backport + - name: backport patches to 8.16 branch + conditions: + - merged + - base=main + - label=backport-8.16 + actions: + backport: + assignees: + - "{{ author }}" + branches: + - "8.16" + title: "[{{ destination_branch }}] {{ title }} (backport #{{ number }})" + labels: + - backport - name: backport patches to 8.15 branch conditions: - merged diff --git a/docs/en/glossary/glossary.asciidoc b/docs/en/glossary/glossary.asciidoc index c3bd93de9..5392e6999 100644 --- a/docs/en/glossary/glossary.asciidoc +++ b/docs/en/glossary/glossary.asciidoc @@ -832,6 +832,10 @@ A network endpoint which is monitored to track the performance and availability applications and services. //Source: Observability +[[glossary-multifactor]] multifactor authentication (MFA):: +A security process that requires you to provide two or more verification methods to gain access to web-based user interfaces. +//Source: Cloud + [[glossary-multi-field]] multi-field:: A <> that's <> in multiple ways. See the {ref}/multi-fields.html[`fields` mapping parameter]. diff --git a/docs/en/glossary/terms/multifactor.asciidoc b/docs/en/glossary/terms/multifactor.asciidoc new file mode 100644 index 000000000..8eaf3c1d4 --- /dev/null +++ b/docs/en/glossary/terms/multifactor.asciidoc @@ -0,0 +1,3 @@ +[[glossary-multifactor]] multifactor authentication (MFA):: +A security process that requires you to provide two or more verification methods to gain access to web-based user interfaces.
+//Source: Cloud \ No newline at end of file diff --git a/docs/en/install-upgrade/air-gapped-install.asciidoc b/docs/en/install-upgrade/air-gapped-install.asciidoc index 6cf0276af..fcb8fe9a9 100644 --- a/docs/en/install-upgrade/air-gapped-install.asciidoc +++ b/docs/en/install-upgrade/air-gapped-install.asciidoc @@ -72,7 +72,8 @@ Specifically: * To be able to use {kib} sample data, install or update hundreds of prebuilt alert rules, and explore available data integrations, you need to set up and configure the <>. * To provide detection rule updates for {endpoint-sec} agents, you need to set up and configure the <>. * To access {ents} capabilities (in addition to the general search capabilities of {es}), you need to set up and configure <>. -* To access the APM integration, you need to set up and configure <>. +* To access the APM integration, you need to set up and configure <>. +* To install and use the Elastic documentation for {kib} AI assistants, you need to set up and configure the <>. [discrete] [[air-gapped-beats]] @@ -163,6 +164,12 @@ Some {ml} features, like natural language processing (NLP), require you to deplo * {ml-docs}/ml-nlp-elser.html#air-gapped-install[Deploy ELSER in an air-gapped environment]. * {eland-docs}/machine-learning.html#ml-nlp-pytorch-air-gapped[Install trained models in an air-gapped environment with Eland]. +[discrete] +[[air-gapped-kibana-product-documentation]] +==== 1.14 {kib} Product documentation for AI Assistants + +Detailed install and configuration instructions are available in the {kibana-ref}/ai-assistant-settings-kb.html[{kib} AI Assistants settings documentation]. + [discrete] [[air-gapped-kubernetes-and-openshift]] === 2. Kubernetes & OpenShift Install diff --git a/docs/en/install-upgrade/installing-stack-demo-secure.asciidoc b/docs/en/install-upgrade/installing-stack-demo-secure.asciidoc index 21edfcc94..7dd3ad4b9 100644 --- a/docs/en/install-upgrade/installing-stack-demo-secure.asciidoc +++ b/docs/en/install-upgrade/installing-stack-demo-secure.asciidoc @@ -98,7 +98,7 @@ sudo /usr/share/elasticsearch/jdk/bin/keytool -importcert -trustcacerts -nopromp + ["source","shell"] ---- -keytool -keystore elastic-stack-ca.p12 -list +sudo /usr/share/elasticsearch/jdk/bin/keytool -keystore /etc/elasticsearch/certs/elastic-stack-ca.p12 -list ---- + NOTE: The keytool utility is provided as part of the {es} installation and is located at: `/usr/share/elasticsearch/jdk/bin/keytool` on RPM installations. diff --git a/docs/en/install-upgrade/installing-stack-demo-self.asciidoc b/docs/en/install-upgrade/installing-stack-demo-self.asciidoc index 39420348a..f2bcd38b0 100644 --- a/docs/en/install-upgrade/installing-stack-demo-self.asciidoc +++ b/docs/en/install-upgrade/installing-stack-demo-self.asciidoc @@ -305,6 +305,12 @@ Note the following tips about enrollment tokens: ==== . In the terminal shell for your second {es} node, pass the enrollment token as a parameter to the `elasticsearch-reconfigure-node` tool: +[IMPORTANT] +==== +Be sure the second node is able to access the first node before running the following command. You can test this by running a curl command against the first node on port 9200. + +If you are unable to access the first node, modify your network configuration before proceeding. +==== + [source,"shell"] ---- @@ -352,11 +358,11 @@ network.host: 10.128.0.132 sudo systemctl start elasticsearch.service ---- -. 
**Optionally**, to view the progress as the second {es} node starts up and connects to the first {es} node, open a new terminal into the second node and `tail` the {es} log file: +. **Optionally**, to view the progress as the second {es} node starts up and connects to the first {es} node, open a new terminal into the second node and `tail` the {es} log file. Be sure to replace `<cluster.name>` with the `cluster.name` you set earlier in the first node's `elasticsearch.yml`: + [source,"shell"] ---- -sudo tail -f /var/log/elasticsearch/elasticsearch-demo.log +sudo tail -f /var/log/elasticsearch/<cluster.name>.log ---- + Notice in the log file some helpful diagnostics, such as: diff --git a/docs/en/install-upgrade/upgrading-stack.asciidoc b/docs/en/install-upgrade/upgrading-stack.asciidoc index 4219c92ac..3371fb999 100644 --- a/docs/en/install-upgrade/upgrading-stack.asciidoc +++ b/docs/en/install-upgrade/upgrading-stack.asciidoc @@ -37,15 +37,42 @@ endif::[] compatible with {es} version {version}. . Test the upgrade in an isolated environment before upgrading your production -cluster. + cluster. ++ +[IMPORTANT] +==== +The upgraded version of {es} may interact with its environment in different +ways from the version you are currently running. It is possible that your +environment behaves incorrectly in a way that does not matter to the version of +{es} that you are currently running, but which does matter to the upgraded +version. In this case, the upgraded version will not work correctly until you +address the incorrect behaviour in your environment. + +During your upgrade tests, pay particular attention to the following aspects: + +Cluster stability:: Does the new version of {es} form a stable healthy cluster? + +Indexing and search performance:: Does the new version of {es} perform the same as +(or better than) the current one on your specific workload and data? + +Snapshots:: Do all of your snapshot repositories work correctly and pass +{ref}/repo-analysis-api.html[repository analysis]? See the sketch after these steps. +==== . Make sure you have a current snapshot before you start the upgrade. + -IMPORTANT: You cannot downgrade {es} nodes after upgrading. -If you cannot complete the upgrade process, -you will need to restore from the snapshot. +IMPORTANT: You cannot downgrade {es} nodes after starting to upgrade your +cluster. If you cannot complete the upgrade process, build a new cluster and +restore a snapshot taken before starting the upgrade. -. If you use a separate {ref}/monitoring-production.html[monitoring cluster], you should upgrade the monitoring cluster before the production cluster. In general, the monitoring cluster and the clusters being monitored should be running the same version of the stack. A monitoring cluster cannot monitor production clusters running newer versions of the stack. If necessary, the monitoring cluster can monitor production clusters running the latest release of the previous major version. +. If you use a separate {ref}/monitoring-production.html[monitoring cluster], + upgrade the monitoring cluster before the production cluster. ++ +In general, the monitoring cluster and the clusters being monitored should be +running the same version of the stack. A monitoring cluster cannot monitor +production clusters running newer versions of the stack. If necessary, the +monitoring cluster can monitor production clusters running the latest release +of the previous major version. 
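As a sketch only, a snapshot repository can be checked with the {ref}/repo-analysis-api.html[repository analysis API] before you upgrade; `my_repository` is a placeholder for your own repository name, and the parameters shown are illustrative values rather than requirements:

[source,console]
----
POST /_snapshot/my_repository/_analyze?blob_count=10&max_blob_size=1mb&timeout=120s
----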
// end::generic-upgrade-steps[] //// diff --git a/docs/en/stack/ml/anomaly-detection/anomaly-detection-scale.asciidoc b/docs/en/stack/ml/anomaly-detection/anomaly-detection-scale.asciidoc index 5326b8567..d338f9101 100644 --- a/docs/en/stack/ml/anomaly-detection/anomaly-detection-scale.asciidoc +++ b/docs/en/stack/ml/anomaly-detection/anomaly-detection-scale.asciidoc @@ -65,7 +65,7 @@ size of a {ml} node. Creating {ml-jobs} with model memory limits larger than the maximum node size can support is not allowed, as autoscaling cannot add a node big enough to run the job. On a self-managed deployment, you can set `xpack.ml.max_model_memory_limit` according to the available resources of the -{ml} node. This prevents you from you creating jobs with model memory limits too +{ml} node. This prevents you from creating jobs with model memory limits too high to open in your cluster. [discrete] diff --git a/docs/en/stack/ml/anomaly-detection/ml-limitations.asciidoc b/docs/en/stack/ml/anomaly-detection/ml-limitations.asciidoc index 5a6792721..aca674bc9 100644 --- a/docs/en/stack/ml/anomaly-detection/ml-limitations.asciidoc +++ b/docs/en/stack/ml/anomaly-detection/ml-limitations.asciidoc @@ -110,18 +110,6 @@ Analyzing large arrays results in long strings which may require more system resources. Consider using a query in the {dfeed} that filters on the relevant items of the array. - -[discrete] -[[ml-frozen-limitations]] -=== Frozen indices are not supported - -{ref}/frozen-indices.html[Frozen indices] cannot be used in {anomaly-jobs} or -{dfeeds}. This limitation applies irrespective of whether you create the jobs in -{kib} or by using APIs. This limitation exists because it's currently not -possible to specify the `ignore_throttled` query parameter for search requests -in {dfeeds} or jobs. See -{ref}/searching_a_frozen_index.html[Searching a frozen index]. 
- [discrete] [[ml-frozen-tier-limitations]] === {anomaly-jobs-cap} on frozen tier data cannot be created in {kib} diff --git a/docs/en/stack/ml/nlp/images/ml-nlp-deployment-id-elser-v2.png b/docs/en/stack/ml/nlp/images/ml-nlp-deployment-id-elser-v2.png index 4ab018c87..d549ea815 100644 Binary files a/docs/en/stack/ml/nlp/images/ml-nlp-deployment-id-elser-v2.png and b/docs/en/stack/ml/nlp/images/ml-nlp-deployment-id-elser-v2.png differ diff --git a/docs/en/stack/ml/nlp/images/ml-nlp-test-ner.png b/docs/en/stack/ml/nlp/images/ml-nlp-test-ner.png index 36541436f..e0f187e68 100644 Binary files a/docs/en/stack/ml/nlp/images/ml-nlp-test-ner.png and b/docs/en/stack/ml/nlp/images/ml-nlp-test-ner.png differ diff --git a/docs/en/stack/ml/nlp/index.asciidoc b/docs/en/stack/ml/nlp/index.asciidoc index 1dd021414..206f72919 100644 --- a/docs/en/stack/ml/nlp/index.asciidoc +++ b/docs/en/stack/ml/nlp/index.asciidoc @@ -4,10 +4,12 @@ include::ml-nlp-extract-info.asciidoc[leveloffset=+2] include::ml-nlp-classify-text.asciidoc[leveloffset=+2] include::ml-nlp-search-compare.asciidoc[leveloffset=+2] include::ml-nlp-deploy-models.asciidoc[leveloffset=+1] +include::ml-nlp-autoscaling.asciidoc[leveloffset=+1] include::ml-nlp-inference.asciidoc[leveloffset=+1] include::ml-nlp-apis.asciidoc[leveloffset=+1] include::ml-nlp-built-in-models.asciidoc[leveloffset=+1] include::ml-nlp-elser.asciidoc[leveloffset=+2] +include::ml-nlp-elastic-rerank.asciidoc[leveloffset=+2] include::ml-nlp-e5.asciidoc[leveloffset=+2] include::ml-nlp-lang-ident.asciidoc[leveloffset=+2] include::ml-nlp-model-ref.asciidoc[leveloffset=+1] diff --git a/docs/en/stack/ml/nlp/ml-nlp-autoscaling.asciidoc b/docs/en/stack/ml/nlp/ml-nlp-autoscaling.asciidoc new file mode 100644 index 000000000..0906621e7 --- /dev/null +++ b/docs/en/stack/ml/nlp/ml-nlp-autoscaling.asciidoc @@ -0,0 +1,159 @@ +[[ml-nlp-auto-scale]] += Trained model autoscaling + +You can enable autoscaling for each of your trained model deployments. +Autoscaling allows {es} to automatically adjust the resources the model deployment can use based on the workload demand. + +There are two ways to enable autoscaling: + +* through APIs by enabling adaptive allocations +* in {kib} by enabling adaptive resources + +IMPORTANT: To fully leverage model autoscaling, it is highly recommended to enable {cloud}/ec-autoscaling.html[{es} deployment autoscaling]. + + +[discrete] +[[nlp-model-adaptive-allocations]] +== Enabling autoscaling through APIs - adaptive allocations + +Model allocations are independent units of work for NLP tasks. +If you set the numbers of threads and allocations for a model manually, they remain constant even when not all the available resources are fully used or when the load on the model requires more resources. +Instead of setting the number of allocations manually, you can enable adaptive allocations to set the number of allocations based on the load on the process. +This can help you to manage performance and cost more easily. +(Refer to the https://cloud.elastic.co/pricing[pricing calculator] to learn more about the possible costs.) + +When adaptive allocations are enabled, the number of allocations of the model is set automatically based on the current load. +When the load is high, a new model allocation is automatically created. +When the load is low, a model allocation is automatically removed. +You can explicitly set the minimum and maximum number of allocations; autoscaling will occur within these limits.
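This is a sketch only: it assumes an existing deployment with the illustrative ID `my-model-deployment` and uses the {ref}/update-trained-model-deployment.html[update trained model deployment API] mentioned in the next paragraph; the limits shown are examples, not recommendations.

[source,console]
----
POST _ml/trained_models/my-model-deployment/deployment/_update
{
  "adaptive_allocations": {
    "enabled": true,
    "min_number_of_allocations": 1,
    "max_number_of_allocations": 4
  }
}
----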
+ +You can enable adaptive allocations by using: + +* the create inference endpoint API for {ref}/infer-service-elser.html[ELSER], {ref}/infer-service-elasticsearch.html[E5 and models uploaded through Eland] that are used as {infer} services. +* the {ref}/start-trained-model-deployment.html[start trained model deployment] or {ref}/update-trained-model-deployment.html[update trained model deployment] APIs for trained models that are deployed on {ml} nodes. + +If the new allocations fit on the current {ml} nodes, they are immediately started. +If more resource capacity is needed for creating new model allocations, then your {ml} node will be scaled up if {ml} autoscaling is enabled to provide enough resources for the new allocation. +The number of model allocations can be scaled down to 0. +They cannot be scaled up to more than 32 allocations, unless you explicitly set the maximum number of allocations to more. +Adaptive allocations must be set up independently for each deployment and {ref}/put-inference-api.html[{infer} endpoint]. + + +[discrete] +[[optimize-use-case]] +=== Optimizing for typical use cases + +You can optimize your model deployment for typical use cases, such as search and ingest. +When you optimize for ingest, the throughput will be higher, which increases the number of {infer} requests that can be performed in parallel. +When you optimize for search, the latency will be lower during search processes. + +* If you want to optimize for ingest, set the number of threads to `1` (`"threads_per_allocation": 1`). +* If you want to optimize for search, set the number of threads to greater than `1`. +Increasing the number of threads will make the search processes more performant. + + +[discrete] +[[nlp-model-adaptive-resources]] +== Enabling autoscaling in {kib} - adaptive resources + +You can enable adaptive resources for your models when starting or updating the model deployment. +Adaptive resources make it possible for {es} to scale up or down the available resources based on the load on the process. +This can help you to manage performance and cost more easily. +When adaptive resources are enabled, the number of vCPUs that the model deployment uses is set automatically based on the current load. +When the load is high, the number of vCPUs that the process can use is automatically increased. +When the load is low, the number of vCPUs that the process can use is automatically decreased. + +You can choose from three levels of resource usage for your trained model deployment; autoscaling will occur within the selected level's range. + +Refer to the tables in the <> section to find out the settings for the level you selected. + + +[role="screenshot"] +image::images/ml-nlp-deployment-id-elser-v2.png["ELSER deployment with adaptive resources enabled.",width=640] + + +[discrete] +[[auto-scaling-matrix]] +== Model deployment resource matrix + +The resources used for trained model deployments depend on three factors: + +* your cluster environment (Serverless, Cloud, or on-premises) +* the use case you optimize the model deployment for (ingest or search) +* whether model autoscaling is enabled with adaptive allocations/resources to have dynamic resources, or disabled for static resources + +If you use {es} on-premises, vCPU level ranges are derived from the `total_ml_processors` and `max_single_ml_node_processors` values. +Use the {ref}/get-ml-info.html[get {ml} info API] to check these values.
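For example, the following request returns these limits, among other {ml} defaults, for the cluster it is run against:

[source,console]
----
GET _ml/info
----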
+The following tables show you the number of allocations, threads, and vCPUs available in Cloud when adaptive resources are enabled or disabled. + +NOTE: On Serverless, adaptive allocations are automatically enabled for all project types. +However, the "Adaptive resources" control is not displayed in {kib} for Observability and Security projects. + + +[discrete] +=== Deployments in Cloud optimized for ingest + +In case of ingest-optimized deployments, we maximize the number of model allocations. + + +[discrete] +==== Adaptive resources enabled + +[cols="4*", options="header"] +|========== +| Level | Allocations | Threads | vCPUs +| Low | 0 to 2 if available, dynamically | 1 | 0 to 2 if available, dynamically +| Medium | 1 to 32 dynamically | 1 | 1 to the smaller of 32 or the limit set in the Cloud console, dynamically +| High | 1 to limit set in the Cloud console ^*^, dynamically | 1 | 1 to limit set in the Cloud console, dynamically +|========== + +^*^ The Cloud console doesn't directly set an allocations limit; it only sets a vCPU limit. +This vCPU limit indirectly determines the number of allocations, calculated as the vCPU limit divided by the number of threads. + +[discrete] +==== Adaptive resources disabled + +[cols="4*", options="header"] +|========== +| Level | Allocations | Threads | vCPUs +| Low | 2 if available, otherwise 1, statically | 1 | 2 if available +| Medium | the smaller of 32 or the limit set in the Cloud console, statically | 1 | 32 if available +| High | Maximum available set in the Cloud console ^*^, statically | 1 | Maximum available set in the Cloud console, statically +|========== + +^*^ The Cloud console doesn't directly set an allocations limit; it only sets a vCPU limit. +This vCPU limit indirectly determines the number of allocations, calculated as the vCPU limit divided by the number of threads. + +[discrete] +=== Deployments in Cloud optimized for search + +In case of search-optimized deployments, we maximize the number of threads. +The maximum number of threads that can be claimed depends on the hardware your architecture has. + +[discrete] +==== Adaptive resources enabled + +[cols="4*", options="header"] +|========== +| Level | Allocations | Threads | vCPUs +| Low | 1 | 2 | 2 +| Medium | 1 to 2 (if threads=16) dynamically | maximum that the hardware allows (for example, 16) | 1 to 32 dynamically +| High | 1 to limit set in the Cloud console ^*^, dynamically| maximum that the hardware allows (for example, 16) | 1 to limit set in the Cloud console, dynamically +|========== + +^*^ The Cloud console doesn't directly set an allocations limit; it only sets a vCPU limit. +This vCPU limit indirectly determines the number of allocations, calculated as the vCPU limit divided by the number of threads. + +[discrete] +==== Adaptive resources disabled + +[cols="4*", options="header"] +|========== +| Level | Allocations | Threads | vCPUs +| Low | 1 if available, statically | 2 | 2 if available +| Medium | 2 (if threads=16) statically | maximum that the hardware allows (for example, 16) | 32 if available +| High | Maximum available set in the Cloud console ^*^, statically | maximum that the hardware allows (for example, 16) | Maximum available set in the Cloud console, statically +|========== + +^*^ The Cloud console doesn't directly set an allocations limit; it only sets a vCPU limit. +This vCPU limit indirectly determines the number of allocations, calculated as the vCPU limit divided by the number of threads. 
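For example, applying this calculation to a hypothetical vCPU limit of 16: an ingest-optimized deployment with 1 thread per allocation could scale up to 16 allocations, while a search-optimized deployment using 16 threads per allocation would be limited to a single allocation.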
\ No newline at end of file diff --git a/docs/en/stack/ml/nlp/ml-nlp-deploy-models.asciidoc b/docs/en/stack/ml/nlp/ml-nlp-deploy-models.asciidoc index 5e10e7004..06c063f1b 100644 --- a/docs/en/stack/ml/nlp/ml-nlp-deploy-models.asciidoc +++ b/docs/en/stack/ml/nlp/ml-nlp-deploy-models.asciidoc @@ -164,66 +164,34 @@ their deployment across your cluster under **{ml-app}** > *Model Management*. Alternatively, you can use the {ref}/start-trained-model-deployment.html[start trained model deployment API]. -You can deploy a model multiple times by assigning a unique deployment ID when -starting the deployment. It enables you to have dedicated deployments for -different purposes, such as search and ingest. By doing so, you ensure that the -search speed remains unaffected by ingest workloads, and vice versa. Having -separate deployments for search and ingest mitigates performance issues -resulting from interactions between the two, which can be hard to diagnose. +You can deploy a model multiple times by assigning a unique deployment ID when starting the deployment. + +You can optimize your deployment for typical use cases, such as search and ingest. +When you optimize for ingest, the throughput will be higher, which increases the number of {infer} requests that can be performed in parallel. +When you optimize for search, the latency will be lower during search processes. +When you have dedicated deployments for different purposes, you ensure that the search speed remains unaffected by ingest workloads, and vice versa. +Having separate deployments for search and ingest mitigates performance issues resulting from interactions between the two, which can be hard to diagnose. [role="screenshot"] -image::images/ml-nlp-deployment-id-elser-v2.png["Model deployment on the Trained Models UI."] - -It is recommended to fine-tune each deployment based on its specific purpose. To -improve ingest performance, increase throughput by adding more allocations to -the deployment. For improved search speed, increase the number of threads per -allocation. - -NOTE: Since eland uses APIs to deploy the models, you cannot see the models in -{kib} until the saved objects are synchronized. You can follow the prompts in -{kib}, wait for automatic synchronization, or use the -{kibana-ref}/machine-learning-api-sync.html[sync {ml} saved objects API]. - -When you deploy the model, its allocations are distributed across available {ml} -nodes. Model allocations are independent units of work for NLP tasks. To -influence model performance, you can configure the number of allocations and the -number of threads used by each allocation of your deployment. Alternatively, you -can enable <> to automatically create and remove -model allocations based on the current workload of the model (you still need to -manually set the number of threads). - -IMPORTANT: If your deployed trained model has only one allocation, it's likely -that you will experience downtime in the service your trained model performs. -You can reduce or eliminate downtime by adding more allocations to your trained -models. +image::images/ml-nlp-deployment-id-elser-v2.png["Model deployment on the Trained Models UI.",width=640] -Throughput can be scaled by adding more allocations to the deployment; it -increases the number of {infer} requests that can be performed in parallel. All -allocations assigned to a node share the same copy of the model in memory.
The -model is loaded into memory in a native process that encapsulates `libtorch`, -which is the underlying {ml} library of PyTorch. The number of allocations -setting affects the amount of model allocations across all the {ml} nodes. Model -allocations are distributed in such a way that the total number of used threads -does not exceed the allocated processors of a node. - -The threads per allocation setting affects the number of threads used by each -model allocation during {infer}. Increasing the number of threads generally -increases the speed of {infer} requests. The value of this setting must not -exceed the number of available allocated processors per node. - -You can view the allocation status in {kib} or by using the -{ref}/get-trained-models-stats.html[get trained model stats API]. If you want to -change the number of allocations, you can use the -{ref}/update-trained-model-deployment.html[update trained model stats API] after -the allocation status is `started`. You can also enable -<> to automatically create and remove model -allocations based on the current workload of the model. +Each deployment will be fine-tuned automatically based on the specific purpose you choose. -[discrete] -[[nlp-model-adaptive-allocations]] -=== Adaptive allocations +NOTE: Since eland uses APIs to deploy the models, you cannot see the models in {kib} until the saved objects are synchronized. +You can follow the prompts in {kib}, wait for automatic synchronization, or use the {kibana-ref}/machine-learning-api-sync.html[sync {ml} saved objects API]. + +You can define the resource usage level of the NLP model during model deployment. +The resource usage levels behave differently depending on <> being enabled or disabled. +When adaptive resources are disabled but {ml} autoscaling is enabled, vCPU usage of Cloud deployments is derived from the Cloud console and functions as follows: + +* Low: This level limits resources to two vCPUs, which may be suitable for development, testing, and demos depending on your parameters. +It is not recommended for production use. +* Medium: This level limits resources to 32 vCPUs, which may be suitable for development, testing, and demos depending on your parameters. +It is not recommended for production use. +* High: This level may use the maximum number of vCPUs available for this deployment from the Cloud console. +If the maximum is 2 vCPUs or fewer, this level is equivalent to the medium or low level. -include::ml-nlp-shared.asciidoc[tag=ml-nlp-adaptive-allocations] +For the resource levels when adaptive resources are enabled, refer to <>. [discrete] @@ -263,7 +231,7 @@ The simplest method to test your model against new data is to use the field of an existing index in your cluster to test the model: [role="screenshot"] -image::images/ml-nlp-test-ner.png[Testing a sentence with two named entities against a NER trained model in the *{ml}* app] +image::images/ml-nlp-test-ner.png["Testing a sentence with two named entities against a NER trained model in the *{ml}* app"] Alternatively, you can use the {ref}/infer-trained-model.html[infer trained model API]. diff --git a/docs/en/stack/ml/nlp/ml-nlp-e5.asciidoc b/docs/en/stack/ml/nlp/ml-nlp-e5.asciidoc index 8d3a4b256..8a5f1c735 100644 --- a/docs/en/stack/ml/nlp/ml-nlp-e5.asciidoc +++ b/docs/en/stack/ml/nlp/ml-nlp-e5.asciidoc @@ -41,6 +41,9 @@ models on HuggingFace for further information including licensing.
To use E5, you must have the {subscriptions}[appropriate subscription] level for semantic search or the trial period activated. +Enabling trained model autoscaling for your E5 deployment is recommended. +Refer to <> to learn more. + [discrete] [[download-deploy-e5]] @@ -275,7 +278,7 @@ your system. -- [source,bash] ---- -git clone https://huggingface.co/elastic/multilingual-e5-small +git clone https://huggingface.co/intfloat/multilingual-e5-small ---- The command results in a local copy of the model in the `multilingual-e5-small` directory. @@ -313,12 +316,6 @@ Once it's uploaded to {es}, the model will have the ID specified by underscores `__`. -- -[discrete] -[[e5-adaptive-allocations]] -== Adaptive allocations - -include::ml-nlp-shared.asciidoc[tag=ml-nlp-adaptive-allocations] - [discrete] [[terms-of-use-e5]] diff --git a/docs/en/stack/ml/nlp/ml-nlp-elastic-rerank.asciidoc b/docs/en/stack/ml/nlp/ml-nlp-elastic-rerank.asciidoc new file mode 100644 index 000000000..982831bb7 --- /dev/null +++ b/docs/en/stack/ml/nlp/ml-nlp-elastic-rerank.asciidoc @@ -0,0 +1,365 @@ +[[ml-nlp-rerank]] += Elastic Rerank + +Elastic Rerank is a state-of-the-art cross-encoder reranking model trained by Elastic that helps you improve search relevance with a few simple API calls. +Elastic Rerank is Elastic's first semantic reranking model and is available out-of-the-box in supporting Elastic deployments using the {es} Inference API. + +Use Elastic Rerank to improve existing search applications including: + +* Traditional BM25 scoring +* Hybrid semantic search +* Retrieval Augmented Generation (RAG) + +The model can significantly improve search result quality by reordering results based on deeper semantic understanding of queries and documents. + +When reranking BM25 results, it provides an average 40% improvement in ranking quality on a diverse benchmark of retrieval tasks— matching the performance of models 11x its size. + +[discrete] +[[ml-nlp-rerank-availability]] +== Availability and requirements + +experimental[] + +[discrete] +[[ml-nlp-rerank-availability-serverless]] +=== Elastic Cloud Serverless + +Elastic Rerank is available in {es} Serverless projects as of November 25, 2024. + +[discrete] +[[ml-nlp-rerank-availability-elastic-stack]] +=== Elastic Cloud Hosted and self-managed deployments + +Elastic Rerank is available in Elastic Stack version 8.17+: + +* To use Elastic Rerank, you must have the appropriate subscription level or the trial period activated. +* A 4GB ML node ++ +[IMPORTANT] +==== +Deploying the Elastic Rerank model in combination with ELSER (or other hosted models) requires at minimum an 8GB ML node. The current maximum size for trial ML nodes is 4GB (defaults to 1GB). +==== + +[discrete] +[[ml-nlp-rerank-deploy]] +== Download and deploy + +To download and deploy Elastic Rerank, use the {ref}/infer-service-elasticsearch.html[create inference API] to create an {es} service `rerank` endpoint. + +[TIP] +==== +Refer to this https://github.com/elastic/elasticsearch-labs/blob/main/notebooks/search/12-semantic-reranking-elastic-rerank.ipynb[Python notebook] for an end-to-end example using Elastic Rerank. +==== + +[discrete] +[[ml-nlp-rerank-deploy-steps]] +=== Create an inference endpoint + +. In {kib}, navigate to the *Dev Console*. + +. 
Create an {infer} endpoint with the Elastic Rerank service by running: ++ +[source,console] +---------------------------------- +PUT _inference/rerank/my-rerank-model +{ + "service": "elasticsearch", + "service_settings": { + "adaptive_allocations": { + "enabled": true, + "min_number_of_allocations": 1, + "max_number_of_allocations": 10 + }, + "num_threads": 1, + "model_id": ".rerank-v1" + } +} +---------------------------------- ++ +NOTE: The API request automatically downloads and deploys the model. This example uses <> through adaptive allocation. + +[NOTE] +==== +You might see a 502 bad gateway error in the response when using the {kib} Console. +This error usually just reflects a timeout, while the model downloads in the background. +You can check the download progress in the {ml-app} UI. +If using the Python client, you can set the `timeout` parameter to a higher value. +==== + +After creating the Elastic Rerank {infer} endpoint, it's ready to use with a {ref}/retriever.html#text-similarity-reranker-retriever-example-elastic-rerank[`text_similarity_reranker`] retriever. + +[discrete] +[[ml-nlp-rerank-deploy-verify]] +== Deploy in an air-gapped environment + +If you want to deploy the Elastic Rerank model in a restricted or closed network, you have two options: + +* Create your own HTTP/HTTPS endpoint with the model artifacts on it +* Put the model artifacts into a directory inside the config directory on all master-eligible nodes. + +[discrete] +[[ml-nlp-rerank-model-artifacts]] +=== Model artifact files + +For the cross-platform version, you need the following files in your system: +``` +https://ml-models.elastic.co/rerank-v1.metadata.json +https://ml-models.elastic.co/rerank-v1.pt +https://ml-models.elastic.co/rerank-v1.vocab.json +``` + +// For the optimized version, you need the following files in your system: +// ``` +// https://ml-models.elastic.co/rerank-v1_linux-x86_64.metadata.json +// https://ml-models.elastic.co/rerank-v1_linux-x86_64.pt +// https://ml-models.elastic.co/rerank-v1_linux-x86_64.vocab.json +// ``` + +[discrete] +=== Using an HTTP server + +INFO: If you use an existing HTTP server, note that the model downloader only +supports passwordless HTTP servers. + +You can use any HTTP service to deploy the model. This example uses the official +Nginx Docker image to set up a new HTTP download service. + +. Download the <>. +. Put the files into a subdirectory of your choice. +. Run the following commands: ++ +-- +[source, shell] +-------------------------------------------------- +export ELASTIC_ML_MODELS="/path/to/models" +docker run --rm -d -p 8080:80 --name ml-models -v ${ELASTIC_ML_MODELS}:/usr/share/nginx/html nginx +-------------------------------------------------- + +Don't forget to change `/path/to/models` to the path of the subdirectory where +the model artifact files are located. + +These commands start a local Docker image with an Nginx server with the +subdirectory containing the model files. As the Docker image has to be +downloaded and built, the first start might take a longer period of time. +Subsequent runs start quicker. +-- +. Verify that Nginx runs properly by visiting the following URL in your +browser: ++ +-- +``` +http://{IP_ADDRESS_OR_HOSTNAME}:8080/rerank-v1.metadata.json +``` + +If Nginx runs properly, you see the content of the metadata file of the model. +-- +. 
Point your {es} deployment to the model artifacts on the HTTP server +by adding the following line to the `config/elasticsearch.yml` file: ++ +-- +``` +xpack.ml.model_repository: http://{IP_ADDRESS_OR_HOSTNAME}:8080 +``` + +If you use your own HTTP or HTTPS server, change the address accordingly. It is +important to specify the protocol ("http://" or "https://"). Ensure that all +master-eligible nodes can reach the server you specify. +-- +. Repeat step 5 on all master-eligible nodes. +. {ref}/restart-cluster.html#restart-cluster-rolling[Restart] the +master-eligible nodes one by one. +. Create an inference endpoint to deploy the model per <>. + +The HTTP server is only required for downloading the model. After the download +has finished, you can stop and delete the service. You can stop the Docker image +used in this example by running the following command: + +[source, shell] +-------------------------------------------------- +docker stop ml-models +-------------------------------------------------- + +[discrete] +=== Using file-based access + +For file-based access, follow these steps: + +. Download the <>. +. Put the files into a `models` subdirectory inside the `config` directory of +your {es} deployment. +. Point your {es} deployment to the model directory by adding the +following line to the `config/elasticsearch.yml` file: ++ +-- +``` +xpack.ml.model_repository: file://${path.home}/config/models/ +``` +-- +. Repeat step 2 and step 3 on all master-eligible nodes. +. {ref}/restart-cluster.html#restart-cluster-rolling[Restart] the +master-eligible nodes one by one. +. Create an inference endpoint to deploy the model per <>. + +[discrete] +[[ml-nlp-rerank-limitations]] +== Limitations + +* English language only +* Maximum context window of 512 tokens ++ +When using the {ref}/semantic-text.html[`semantic_text` field type], text is divided into chunks. By default, each chunk contains 250 words (approximately 400 tokens). Be cautious when increasing the chunk size: if the combined length of your query and chunk text exceeds 512 tokens, the model won't have access to the full content. ++ +When the combined inputs exceed the 512 token limit, a balanced truncation strategy is used. If both the query and input text are longer than 255 tokens each, then both are truncated; otherwise the longest is truncated. + +[discrete] +[[ml-nlp-rerank-perf-considerations]] +== Performance considerations + +It's important to note that if you rerank to depth `n` then you will need to run `n` inferences per query. This will include the document text and will therefore be significantly more expensive than inference for query embeddings. Hardware can be scaled to run these inferences in parallel, but we would recommend shallow reranking for CPU inference: no more than top-30 results. You may find that the preview version is cost prohibitive for high query rates and low query latency requirements. We plan to address performance issues for GA. + +[discrete] +[[ml-nlp-rerank-model-specs]] +== Model specifications + +* Purpose-built for English language content + +* Relatively small: 184M parameters (86M backbone + 98M embedding layer) + +* Matches performance of billion-parameter reranking models + +* Built directly into {es} - no external services or dependencies needed + +[discrete] +[[ml-nlp-rerank-arch-overview]] +== Model architecture + +Elastic Rerank is built on the https://arxiv.org/abs/2111.09543[DeBERTa v3] language model architecture.
+ +The model employs several key architectural features that make it particularly effective for reranking: + +* *Disentangled attention mechanism* enables the model to: +** Process word content and position separately +** Learn more nuanced relationships between query and document text +** Better understand the semantic importance of word positions and relationships + +* *ELECTRA-style pre-training* uses: +** A GAN-like approach to token prediction +** Simultaneous training of token generation and detection +** Enhanced parameter efficiency compared to traditional masked language modeling + +[discrete] +[[ml-nlp-rerank-arch-training]] +== Training process + +Here is an overview of the Elastic Rerank model training process: + +* *Initial relevance extraction* +** Fine-tunes the pre-trained DeBERTa [CLS] token representation +** Uses a GeLU activation and dropout layer +** Preserves important pre-trained knowledge while adapting to the reranking task + +* *Trained by distillation* +** Uses an ensemble of bi-encoder and cross-encoder models as a teacher +** Bi-encoder provides nuanced negative example assessment +** Cross-encoder helps differentiate between positive and negative examples +** Combines strengths of both model types + +[discrete] +[[ml-nlp-rerank-arch-data]] +=== Training data + +The training data consists of: + +* Open domain Question-Answering datasets +* Natural document pairs (like article headings and summaries) +* 180,000 synthetic query-passage pairs with varying relevance +* Total of approximately 3 million queries + +The data preparation process includes: + +* Basic cleaning and fuzzy deduplication +* Multi-stage prompting for diverse topics (on the synthetic portion of the training data only) +* Varied query types: +** Keyword search +** Exact phrase matching +** Short and long natural language questions + +[discrete] +[[ml-nlp-rerank-arch-sampling]] +=== Negative sampling + +The model uses an advanced sampling strategy to ensure high-quality rankings: + +* Samples from top 128 documents per query using multiple retrieval methods +* Uses five negative samples per query - more than typical approaches +* Applies probability distribution shaped by document scores for sampling + +* Deep sampling benefits: +** Improves model robustness across different retrieval depths +** Enhances score calibration +** Provides better handling of document diversity + +[discrete] +[[ml-nlp-rerank-arch-optimization]] +=== Training optimization + +The training process incorporates several key optimizations: + +Uses cross-entropy loss function to: + +* Model relevance as probability distribution +* Learn relationships between all document scores +* Fit scores through maximum likelihood estimation + +Implemented parameter averaging along optimization trajectory: + +* Eliminates need for traditional learning rate scheduling and provides improvement in the final model quality + +[discrete] +[[ml-nlp-rerank-performance]] +== Performance + +Elastic Rerank shows significant improvements in search quality across a wide range of retrieval tasks. 
+ +[discrete] +[[ml-nlp-rerank-performance-overview]] +=== Overview + +* Average 40% improvement in ranking quality when reranking BM25 results +* 184M parameter model matches performance of 2B parameter alternatives +* Evaluated across 21 different datasets using the BEIR benchmark suite + +[discrete] +[[ml-nlp-rerank-performance-benchmarks]] +=== Key benchmark results + +* Natural Questions: 90% improvement +* MS MARCO: 85% improvement +* Climate-FEVER: 80% improvement +* FiQA-2018: 76% improvement + +For detailed benchmark information, including complete dataset results and methodology, refer to the https://www.elastic.co/search-labs/blog/elastic-semantic-reranker-part-2[Introducing Elastic Rerank blog]. + +// [discrete] +// [[ml-nlp-rerank-benchmarks-hw]] +// === Hardware benchmarks +// Note: these are more for GA timeframe + +[discrete] +[[ml-nlp-rerank-resources]] +== Further resources + +*Documentation*: + +* {ref}/semantic-reranking.html#semantic-reranking-in-es[Semantic re-ranking in {es} overview] +* {ref}/infer-service-elasticsearch.html#inference-example-elastic-reranker[Inference API example] + +*Blogs*: + +* https://www.elastic.co/search-labs/blog/elastic-semantic-reranker-part-1[Part 1] +* https://www.elastic.co/search-labs/blog/elastic-semantic-reranker-part-2[Part 2] +* https://www.elastic.co/search-labs/blog/elastic-semantic-reranker-part-3[Part 3] + +*Python notebooks*: + +* https://github.com/elastic/elasticsearch-labs/blob/main/notebooks/search/12-semantic-reranking-elastic-rerank.ipynb[End-to-end example using Elastic Rerank in Python] diff --git a/docs/en/stack/ml/nlp/ml-nlp-elser.asciidoc b/docs/en/stack/ml/nlp/ml-nlp-elser.asciidoc index e4faee8d7..a3b645e88 100644 --- a/docs/en/stack/ml/nlp/ml-nlp-elser.asciidoc +++ b/docs/en/stack/ml/nlp/ml-nlp-elser.asciidoc @@ -66,6 +66,9 @@ more allocations or more threads per allocation, which requires bigger ML nodes. Autoscaling provides bigger nodes when required. If autoscaling is turned off, you must provide suitably sized nodes yourself. +Enabling trained model autoscaling for your ELSER deployment is recommended. +Refer to <> to learn more. + [discrete] [[elser-v2]] @@ -87,7 +90,7 @@ to download and deploy the model and you don't need to select from different versions. If you want to learn more about the ELSER V2 improvements, refer to -https://www.elastic.co/search-labs/introducing-elser-v2-part-1[this blog post]. +https://www.elastic.co/search-labs/blog/introducing-elser-v2-part-1[this blog post]. [discrete] @@ -121,13 +124,18 @@ PUT _inference/sparse_embedding/my-elser-model { "service": "elser", "service_settings": { - "num_allocations": 1, + "adaptive_allocations": { + "enabled": true, + "min_number_of_allocations": 1, + "max_number_of_allocations": 10 + }, "num_threads": 1 } } ---------------------------------- -- The API request automatically initiates the model download and then deploy the model. +This example uses <> through adaptive allocation. Refer to the {ref}/infer-service-elser.html[ELSER {infer} service documentation] to learn more about the available settings. @@ -381,7 +389,7 @@ following line to the `config/elasticsearch.yml` file: + -- ``` -xpack.ml.model_repository: file://${path.home}/config/models/` +xpack.ml.model_repository: file://${path.home}/config/models/ ``` -- . Repeat step 2 and step 3 on all master-eligible nodes. 
@@ -424,35 +432,51 @@ image::images/ml-nlp-elser-v2-test.png[alt="Testing ELSER",align="center"] [[performance]] == Performance considerations -* ELSER works best on small-to-medium sized fields that contain natural -language. For connector or web crawler use cases, this aligns best with fields -like _title_, _description_, _summary_, or _abstract_. As ELSER encodes the -first 512 tokens of a field, it may not provide as relevant of results for large -fields. For example, `body_content` on web crawler documents, or body fields -resulting from extracting text from office documents with connectors. For larger -fields like these, consider "chunking" the content into multiple values, where -each chunk can be under 512 tokens. -* Larger documents take longer at ingestion time, and {infer} time per -document also increases the more fields in a document that need to be processed. -* The more fields your pipeline has to perform inference on, the longer it takes -per document to ingest. +* ELSER works best on small-to-medium sized fields that contain natural language. +For connector or web crawler use cases, this aligns best with fields like _title_, _description_, _summary_, or _abstract_. +As ELSER encodes the first 512 tokens of a field, it may not provide as relevant of results for large fields. +For example, `body_content` on web crawler documents, or body fields resulting from extracting text from office documents with connectors. +For larger fields like these, consider "chunking" the content into multiple values, where each chunk can be under 512 tokens. +* Larger documents take longer at ingestion time, and {infer} time per document also increases the more fields in a document that need to be processed. +* The more fields your pipeline has to perform inference on, the longer it takes per document to ingest. To learn more about ELSER performance, refer to the <>. [discrete] -[[elser-adaptive-allocations]] -== Adaptive allocations +[[pre-cleaning]] +== Pre-cleaning input text + +The quality of the input text significantly affects the quality of the embeddings. +To achieve the best results, it's recommended to clean the input text before generating embeddings. +The exact preprocessing you may need to do heavily depends on your text. +For example, if your text contains HTML tags, use the {ref}/htmlstrip-processor.html[HTML strip processor] in an ingest pipeline to remove unnecessary elements. +Always review and clean your input text before ingestion to eliminate any irrelevant entities that might affect the results. + + +[discrete] +[[elser-recommendations]] +== Recommendations for using ELSER + +To gain the biggest value out of ELSER trained models, consider following these recommendations. -include::ml-nlp-shared.asciidoc[tag=ml-nlp-adaptive-allocations] +* If quick response time is important for your use case, keep {ml} resources available at all times by setting `min_allocations` to `1`. +* Setting `min_allocations` to `0` can save on costs for non-critical use cases or testing environments. +* Enabling <> through adaptive allocations or adaptive resources makes it possible for {es} to scale up or down the available resources of your ELSER deployment based on the load on the process. + +* Use dedicated, optimized ELSER {infer} endpoints for ingest and search use cases. +** When deploying a trained model in {kib}, you can select for which case you want to optimize your ELSER deployment.
+** If you use the trained model or {infer} APIs and want to optimize your ELSER trained model deployment or {infer} endpoint for ingest, set the number of threads to `1` (`"num_threads": 1`). +** If you use the trained model or {infer} APIs and want to optimize your ELSER trained model deployment or {infer} endpoint for search, set the number of threads to greater than `1`. [discrete] [[further-readings]] == Further reading +* {ref}/semantic-search-semantic-text.html[Perform semantic search with `semantic_text` using the ELSER endpoint] * {ref}/semantic-search-elser.html[Perform semantic search with ELSER] -* https://www.elastic.co/blog/may-2023-launch-information-retrieval-elasticsearch-ai-model[Improving information retrieval in the Elastic Stack: Introducing Elastic Learned Sparse Encoder, our new retrieval model] + [discrete] [[elser-benchmarks]] @@ -535,7 +559,7 @@ IMPORTANT: The length of the documents in your particular dataset will have a significant impact on your throughput numbers. Refer to -https://www.elastic.co/search-labs/introducing-elser-v2-part-1[this blog post] +https://www.elastic.co/search-labs/blog/introducing-elser-v2-part-1[this blog post] to learn more about ELSER V2 improved performance. image::images/ml-nlp-elser-bm-summary.png[alt="Summary of ELSER V1 and V2 benchmark reports",align="center"] diff --git a/docs/en/stack/ml/nlp/ml-nlp-limitations.asciidoc b/docs/en/stack/ml/nlp/ml-nlp-limitations.asciidoc index b1fb95ec7..e505bb63b 100644 --- a/docs/en/stack/ml/nlp/ml-nlp-limitations.asciidoc +++ b/docs/en/stack/ml/nlp/ml-nlp-limitations.asciidoc @@ -9,6 +9,12 @@ The following limitations and known problems apply to the {version} release of the Elastic {nlp} trained models feature. +[discrete] +[[ml-nlp-large-documents-limit-10k-10mb]] +== Document size limitations when using `semantic_text` fields + +When using semantic text to ingest documents, chunking takes place automatically. The number of chunks is limited by the {ref}/mapping-settings-limit.html[`index.mapping.nested_objects.limit`] cluster setting, which defaults to 10k. Documents that are too large will cause errors during ingestion. To avoid this issue, please split your documents into roughly 1MB parts before ingestion. + [discrete] [[ml-nlp-elser-v1-limit-512]] == ELSER semantic search is limited to 512 tokens per field that inference is applied to @@ -18,15 +24,3 @@ each field of the ingested documents that ELSER is applied to are taken into account for the search process. If your data set contains long documents, divide them into smaller segments before ingestion if you need the full text to be searchable. - - -[discrete] -[[ml-nlp-elser-autoscale]] -== ELSER deployments don't autoscale - -Currently, ELSER deployments do not scale up and down automatically depending on -the resource requirements of the ELSER processes. If you want to configure -available resources for your ELSER deployments, you can manually set the number -of allocations and threads per allocation by using the Trained Models UI in -{kib} or the -{ref}/update-trained-model-deployment.html[Update trained model deployment API]. 
\ No newline at end of file diff --git a/docs/en/stack/ml/nlp/ml-nlp-model-ref.asciidoc b/docs/en/stack/ml/nlp/ml-nlp-model-ref.asciidoc index 1fd312198..dd664147b 100644 --- a/docs/en/stack/ml/nlp/ml-nlp-model-ref.asciidoc +++ b/docs/en/stack/ml/nlp/ml-nlp-model-ref.asciidoc @@ -26,6 +26,7 @@ The current list of supported architectures is: * BERT * BART * DPR bi-encoders +* DeBERTa * DistilBERT * ELECTRA * MobileBERT @@ -77,6 +78,16 @@ purposes and to get started with the Elastic {nlp} features. * https://huggingface.co/deepset/electra-base-squad2[Electra base squad2] * https://huggingface.co/deepset/tinyroberta-squad2[TinyRoBERTa squad2] +[discrete] +[[ml-nlp-model-ref-sparse-embedding]] +== Third party sparse embedding models + +Sparse embedding models should be configured with the `text_expansion` task type. + +* https://huggingface.co/naver/splade-v3-distilbert[SPLADE-v3-DistilBERT] +* https://huggingface.co/aken12/splade-japanese-v3[aken12/splade-japanese-v3] +* https://huggingface.co/hotchpotch/japanese-splade-v2[hotchpotch/japanese-splade-v2] + [discrete] [[ml-nlp-model-ref-text-embedding]] @@ -148,8 +159,12 @@ Using `DPREncoderWrapper`: [discrete] [[ml-nlp-model-ref-text-similarity]] == Third party text similarity models + +You can use these text similarity models for {ref}/semantic-reranking.html#semantic-reranking-in-es[semantic re-ranking]. + * https://huggingface.co/cross-encoder/ms-marco-TinyBERT-L-2-v2[ms marco TinyBERT L2 v2] * https://huggingface.co/cross-encoder/ms-marco-MiniLM-L-6-v2[ms marco MiniLM L6 v2] +* https://huggingface.co/BAAI/bge-reranker-base[BAAI/bge-reranker-base] [discrete] [[ml-nlp-model-ref-zero-shot]] diff --git a/docs/en/stack/ml/nlp/ml-nlp-shared.asciidoc b/docs/en/stack/ml/nlp/ml-nlp-shared.asciidoc index 1e3948536..0568cda26 100644 --- a/docs/en/stack/ml/nlp/ml-nlp-shared.asciidoc +++ b/docs/en/stack/ml/nlp/ml-nlp-shared.asciidoc @@ -1,22 +1,3 @@ -tag::ml-nlp-adaptive-allocations[] -The numbers of threads and allocations you can set manually for a model remain constant even when not all the available resources are fully used or when the load on the model requires more resources. -Instead of setting the number of allocations manually, you can enable adaptive allocations to set the number of allocations based on the load on the process. This can help you to manage performance and cost more easily. -When adaptive allocations are enabled, the number of allocations of the model is set automatically based on the current load. -When the load is high, a new model allocation is automatically created. -When the load is low, a model allocation is automatically removed. - -You can enable adaptive allocations by using: - -* the Create inference endpoint API for {ref}/infer-service-elser.html[ELSER], {ref}/infer-service-elasticsearch.html[E5 and models uploaded through Eland] that are used as {infer} services. -* the {ref}/start-trained-model-deployment.html[start trained model deployment] or {ref}/update-trained-model-deployment.html[update trained model deployment] APIs for trained models that are deployed on {ml} nodes. - -If the new allocations fit on the current {ml} nodes, they are immediately started. -If more resource capacity is needed for creating new model allocations, then your {ml} node will be scaled up if {ml} autoscaling is enabled to provide enough resources for the new allocation. -The number of model allocations cannot be scaled down to less than 1. 
-And they cannot be scaled up to more than 32 allocations, unless you explicitly set the maximum number of allocations to more. -Adaptive allocations must be set up independently for each deployment and {infer} endpoint. -end::ml-nlp-adaptive-allocations[] - tag::nlp-eland-clone-docker-build[] You can use the {eland-docs}[Eland client] to install the {nlp} model. Use the prebuilt Docker image to run the Eland install model commands. Pull the latest image with: diff --git a/docs/en/stack/ml/nlp/ml-nlp.asciidoc b/docs/en/stack/ml/nlp/ml-nlp.asciidoc index 18fe43747..92f161413 100644 --- a/docs/en/stack/ml/nlp/ml-nlp.asciidoc +++ b/docs/en/stack/ml/nlp/ml-nlp.asciidoc @@ -14,6 +14,7 @@ predictions. * <> * <> +* <> * <> * <> * <>