Ray Data
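Vortex provides a Ray Data datasource for reading Vortex files in distributed Ray pipelines. The example below writes the same Parquet table to three Vortex files in one directory, then reads that directory back through `VortexDatasource` and converts the result to a pandas DataFrame: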
>>> import vortex as vx
>>> import pyarrow.parquet as pq
>>> import os
>>> os.makedirs("ray_data", exist_ok=True)
>>> table = pq.read_table("_static/example.parquet")
>>> vx.io.write(table, 'ray_data/example-01.vortex')
>>> vx.io.write(table, 'ray_data/example-02.vortex')
>>> vx.io.write(table, 'ray_data/example-03.vortex')
>>>
>>> from vortex.ray.datasource import VortexDatasource
>>> from ray.data import read_datasource
>>>
>>> ds = read_datasource(VortexDatasource(url='ray_data'))
>>> ds.to_pandas()
VendorID tpep_pickup_datetime ... congestion_surcharge Airport_fee
0 1 2023-11-01 00:03:03 ... 0.0 1.75
1 1 2023-11-01 00:03:28 ... 2.5 0.00
2 2 2023-10-31 23:58:05 ... 2.5 1.75
3 2 2023-11-01 00:03:50 ... 2.5 0.00
4 2 2023-11-01 00:06:30 ... 2.5 0.00
... ... ... ... ... ...
2995 1 2023-11-01 00:09:20 ... 2.5 0.00
2996 2 2023-11-01 00:16:03 ... 2.5 0.00
2997 2 2023-11-01 00:32:42 ... 2.5 0.00
2998 1 2023-11-01 00:04:52 ... 2.5 0.00
2999 1 2023-11-01 00:18:56 ... 2.5 0.00
[3000 rows x 19 columns]