Ray Data

Vortex provides a Ray Data datasource for reading Vortex files in distributed Ray pipelines.

>>> import vortex as vx
>>> import pyarrow.parquet as pq
>>> import os
>>> os.makedirs("ray_data", exist_ok=True)  # exist_ok=True: do not fail if the directory is already there
>>> table = pq.read_table("_static/example.parquet")
>>> vx.io.write(table, 'ray_data/example-01.vortex')  # the same table is written to three separate files
>>> vx.io.write(table, 'ray_data/example-02.vortex')
>>> vx.io.write(table, 'ray_data/example-03.vortex')
>>> # Below, the datasource is given the directory holding the three files, not a single file path.
>>> from vortex.ray.datasource import VortexDatasource
>>> from ray.data import read_datasource
>>>
>>> ds = read_datasource(VortexDatasource(url='ray_data'))
>>> ds.to_pandas()  # converts the dataset to a pandas DataFrame; its repr is the output that follows
      VendorID tpep_pickup_datetime  ... congestion_surcharge  Airport_fee
0            1  2023-11-01 00:03:03  ...                  0.0         1.75
1            1  2023-11-01 00:03:28  ...                  2.5         0.00
2            2  2023-10-31 23:58:05  ...                  2.5         1.75
3            2  2023-11-01 00:03:50  ...                  2.5         0.00
4            2  2023-11-01 00:06:30  ...                  2.5         0.00
...        ...                  ...  ...                  ...          ...
2995         1  2023-11-01 00:09:20  ...                  2.5         0.00
2996         2  2023-11-01 00:16:03  ...                  2.5         0.00
2997         2  2023-11-01 00:32:42  ...                  2.5         0.00
2998         1  2023-11-01 00:04:52  ...                  2.5         0.00
2999         1  2023-11-01 00:18:56  ...                  2.5         0.00

[3000 rows x 19 columns]