Rust Quickstart¶
Install¶
Install vortex and all the first-party array encodings:
cargo add vortex
Convert¶
You can either use your own Parquet file or download the example used here.
Use Arrow to read a Parquet file and then construct an uncompressed Vortex array:
use std::fs::File;
use arrow_array::RecordBatchReader;
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use vortex::array::ChunkedArray;
use vortex::arrow::FromArrowType;
use vortex::{Array, IntoArray};
use vortex::dtype::DType;
let reader =
ParquetRecordBatchReaderBuilder::try_new(File::open("_static/example.parquet").unwrap())
.unwrap()
.build()
.unwrap();
let dtype = DType::from_arrow(reader.schema());
let chunks = reader
.map(|x| Array::try_from(x.unwrap()).unwrap())
.collect::<Vec<_>>();
let vtx = ChunkedArray::try_new(chunks, dtype).unwrap().into_array();
Compress¶
Use the sampling compressor to compress the Vortex array and check the relative size:
use std::collections::HashSet;
use vortex::sampling_compressor::{SamplingCompressor, DEFAULT_COMPRESSORS};
let compressor = SamplingCompressor::new(HashSet::from(*DEFAULT_COMPRESSORS));
let cvtx = compressor.compress(&vtx, None).unwrap().into_array();
println!("{}", cvtx.nbytes());
Write¶
Reading and writing both require an async runtime; in this example we use Tokio. The VortexFileWriter knows how to write Vortex arrays to disk:
use std::path::Path;
use tokio::fs::File as TokioFile;
use vortex_serde::file::write::writer::VortexFileWriter;
let file = TokioFile::create(Path::new("example.vortex"))
.await
.unwrap();
let writer = VortexFileWriter::new(file)
.write_array_columns(cvtx.clone())
.await
.unwrap();
writer.finalize().await.unwrap();
Read¶
use futures::TryStreamExt;
use vortex::sampling_compressor::ALL_COMPRESSORS_CONTEXT;
use vortex_serde::file::read::builder::{VortexReadBuilder, LayoutDeserializer};
let file = TokioFile::open(Path::new("example.vortex")).await.unwrap();
let builder = VortexReadBuilder::new(
file,
LayoutDeserializer::new(
ALL_COMPRESSORS_CONTEXT.clone(),
LayoutContext::default().into(),
),
);
let stream = builder.build().await.unwrap();
let dtype = stream.schema().clone().into();
let vecs: Vec<Array> = stream.try_collect().await.unwrap();
let cvtx = ChunkedArray::try_new(vecs, dtype)
.unwrap()
.into_array();
println!("{}", cvtx.nbytes());