googleapis · GarrettWu · Nov 19, 2025 · Nov 20, 2025 · Nov 24, 2025 · Nov 24, 2025
@@ -110,6 +110,7 @@ def _stratify_split(df: bpd.DataFrame, stratify: bpd.Series) -> List[bpd.DataFra
     joined_df = dfs[0]
     for df in dfs[1:]:
         joined_df = joined_df.join(df, how="outer")
+    joined_df = joined_df.cache()
     if stratify is None:
         joined_df_train, joined_df_test = joined_df._split(
             fracs=(train_size, test_size), random_state=random_state
@@ -120,8 +121,8 @@ def _stratify_split(df: bpd.DataFrame, stratify: bpd.Series) -> List[bpd.DataFra
     results = []
     for array in arrays:
         columns = array.name if isinstance(array, bpd.Series) else array.columns
-        results.append(joined_df_train[columns])
-        results.append(joined_df_test[columns])
+        results.append(joined_df_train[columns].cache())
+        results.append(joined_df_test[columns].cache())
 block, string_ordering_col = block.apply_unary_op( 
     ordering_col, ops.AsTypeOp(to_type=bigframes.dtypes.STRING_DTYPE) 
 ) 
 # Hash the concatenated ordering/random-state string column and order by the hash. 
 block, string_sum_col = block.apply_binary_op( 
     string_ordering_col, random_state_col, ops.strconcat_op 
 ) 
 block, hash_string_sum_col = block.apply_unary_op(string_sum_col, ops.hash_op) 
 block = block.order_by( 
     [ordering.OrderingExpression(ex.deref(hash_string_sum_col))] 
 ) 
 block, string_ordering_col = block.apply_unary_op( 
     ordering_col, ops.AsTypeOp(to_type=bigframes.dtypes.STRING_DTYPE) 
 ) 
  
 # Hash the concatenated ordering/random-state string column and order by the hash. 
 block, string_sum_col = block.apply_binary_op( 
     string_ordering_col, random_state_col, ops.strconcat_op 
 ) 
 block, hash_string_sum_col = block.apply_unary_op(string_sum_col, ops.hash_op) 
 block = block.order_by( 
     [ordering.OrderingExpression(ex.deref(hash_string_sum_col))] 
 ) 
  
 
     return results
 

@@ -46,6 +46,24 @@ def test_train_test_split_default_correct_shape(df_fixture, request):
     assert y_test.shape == (86, 1)
 
 
+def test_train_test_split_default_unordered_same_index(
+    unordered_session, penguins_pandas_df_default_index
+):
+    df = unordered_session.read_pandas(penguins_pandas_df_default_index)
+    X = df[
+        [
+            "species",
+            "island",
+            "culmen_length_mm",
+        ]
+    ]
+    y = df[["body_mass_g"]]
+    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y)
+
+    pd.testing.assert_index_equal(X_train.to_pandas().index, y_train.to_pandas().index)
+    pd.testing.assert_index_equal(X_test.to_pandas().index, y_test.to_pandas().index)
+
+
 def test_train_test_split_series_default_correct_shape(penguins_df_default_index):
     X = penguins_df_default_index[["species"]]
     y = penguins_df_default_index["body_mass_g"]