Ring + Ulysses 2D context parallelism #404
```diff
@@ -64,9 +64,11 @@ jit_initializers: True
 # Set true to load weights from pytorch
 from_pt: True
 split_head_dim: True
-attention: 'flash' # Supported attention: dot_product, flash, tokamax_flash, cudnn_flash_te, ring, tokamax_ring, ulysses, ulysses_custom
+attention: 'flash' # Supported attention: dot_product, flash, tokamax_flash, cudnn_flash_te, ring, tokamax_ring, ulysses, ulysses_custom, ulysses_ring
 use_base2_exp: True
 use_experimental_scheduler: True
+# For attention=ulysses_ring, hidden Ulysses shard count; ring shards are context / this.
+ulysses_shards: -1
 flash_min_seq_length: 4096
 dropout: 0.0
```
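The new `ulysses_shards` knob determines how the context-parallel axis is factored into the two levels of the 2D scheme. Below is a minimal sketch of the arithmetic implied by the config comment, assuming a context axis of size C is split into U Ulysses shards and C / U ring shards; the helper name and the interpretation of `-1` are illustrative, not the PR's actual code.

```python
def split_context_axis(context_parallel_size: int, ulysses_shards: int) -> tuple[int, int]:
    """Factor the context-parallel axis into (ulysses_shards, ring_shards).

    Hypothetical helper: -1 is read here as "use the whole context axis for
    Ulysses", mirroring common "auto" defaults; the PR may resolve -1 differently.
    """
    if ulysses_shards == -1:
        ulysses_shards = context_parallel_size
    if context_parallel_size % ulysses_shards != 0:
        raise ValueError(
            f"context axis {context_parallel_size} is not divisible by "
            f"ulysses_shards={ulysses_shards}"
        )
    # Per the config comment: ring shards are context / ulysses_shards.
    ring_shards = context_parallel_size // ulysses_shards
    return ulysses_shards, ring_shards


# Example: 8 context-parallel devices with ulysses_shards=2 -> 2 Ulysses groups x 4-way ring.
assert split_context_axis(8, 2) == (2, 4)
```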
```diff
@@ -81,14 +83,14 @@ mask_padding_tokens: True
 attention_sharding_uniform: True

 flash_block_sizes: {
-  "block_q" : 512,
-  "block_kv_compute" : 512,
-  "block_kv" : 512,
-  "block_q_dkv" : 512,
-  "block_kv_dkv" : 512,
-  "block_kv_dkv_compute" : 512,
-  "block_q_dq" : 512,
-  "block_kv_dq" : 512,
+  "block_q" : 2048,
+  "block_kv_compute" : 1024,
+  "block_kv" : 2048,
+  "block_q_dkv" : 2048,
+  "block_kv_dkv" : 2048,
+  "block_kv_dkv_compute" : 1024,
+  "block_q_dq" : 2048,
+  "block_kv_dq" : 2048,
   "use_fused_bwd_kernel": False,
 }
 # Use on v6e
```
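The keys in `flash_block_sizes` mirror the fields of JAX's TPU splash-attention `BlockSizes`, so the dict plausibly feeds a construction like the sketch below. The exact wiring inside the repo is an assumption here; only the `BlockSizes` field names come from the JAX API.

```python
# Sketch (assumed wiring, not taken from this PR): map the flash_block_sizes
# config onto the Pallas TPU splash-attention BlockSizes dataclass.
from jax.experimental.pallas.ops.tpu.splash_attention import splash_attention_kernel

flash_block_sizes = {
    "block_q": 2048,
    "block_kv_compute": 1024,
    "block_kv": 2048,
    "block_q_dkv": 2048,
    "block_kv_dkv": 2048,
    "block_kv_dkv_compute": 1024,
    "block_q_dq": 2048,
    "block_kv_dq": 2048,
    "use_fused_bwd_kernel": False,
}

# Larger tiles generally reduce grid overhead at the cost of VMEM; the diff
# itself does not state the motivation for moving from 512 to 2048/1024.
block_sizes = splash_attention_kernel.BlockSizes(**flash_block_sizes)
```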
🟡 This axis rule shards the cross-attention KV length as `None` (replicated), which matches `SEQUENCE_PARALLEL_AXIS_RULES`. Given that `ulysses_ring` falls back to `tokamax_flash` for cross-attention, this is consistent and ensures compatibility when the encoder sequence is not sharded across the context mesh.
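For readers unfamiliar with the "sharded as `None` (replicated)" phrasing, here is an illustrative JAX snippet (not from the PR) showing the distinction in `PartitionSpec` terms; the mesh axis name `"context"` and the (batch, seq, heads, head_dim) layout are assumptions made for the example.

```python
import numpy as np
import jax
from jax.sharding import Mesh, NamedSharding, PartitionSpec

# One-dimensional "context" mesh over whatever devices are available.
mesh = Mesh(np.array(jax.devices()), axis_names=("context",))

# Self-attention KV: the sequence axis is split across the context mesh axis.
self_attn_kv_spec = PartitionSpec(None, "context", None, None)

# Cross-attention KV: mapping the sequence axis to None replicates the encoder
# tokens on every device, matching the axis rule discussed in the comment above.
cross_attn_kv_spec = PartitionSpec(None, None, None, None)

cross_kv_sharding = NamedSharding(mesh, cross_attn_kv_spec)
print(cross_kv_sharding)
```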