Using parquet-avro library in Java to read parquet file written using pyarrow
Question
I am writing a pandas DataFrame to a Parquet file using pyarrow in Python:
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd

df = pd.DataFrame(
    {
        "numbers": [1, 2, 3],
        "colors": ["red", "white", "blue"],
        "dates": ["2019-12-16", "2019-12-16", "2019-12-16"],
        "codes": [None, None, None],
    }
)

table = pa.Table.from_pandas(df)
pq.write_table(table, "filename")
When the Parquet file is read in Java (or in Sublime Text configured with parquet-tools), it shows:
{"numbers":1,"codes":"R1"}
{"numbers":2,"codes":"G1"}
{"numbers":3,"codes":"B1"}
The code I am using to read the Parquet file is this:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroup;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.ColumnIOFactory;
import org.apache.parquet.io.MessageColumnIO;
import org.apache.parquet.io.RecordReader;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Type;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

// Simple holder for the records and the schema fields read from the file.
class Parquet {
    private final List<SimpleGroup> data;
    private final List<Type> schema;

    public Parquet(List<SimpleGroup> data, List<Type> schema) {
        this.data = data;
        this.schema = schema;
    }

    public List<SimpleGroup> getData() {
        return data;
    }

    public List<Type> getSchema() {
        return schema;
    }
}

// The enclosing class declaration was missing from my snippet;
// "ParquetDump" is just a placeholder name for it.
class ParquetDump {

    public static Parquet getParquetData(String filePath) throws IOException {
        List<SimpleGroup> simpleGroups = new ArrayList<>();
        ParquetFileReader reader = ParquetFileReader.open(
                HadoopInputFile.fromPath(new Path(filePath), new Configuration()));
        // The footer schema lists every column, even ones whose values are all null.
        MessageType schema = reader.getFooter().getFileMetaData().getSchema();
        List<Type> fields = schema.getFields();
        PageReadStore pages;
        while ((pages = reader.readNextRowGroup()) != null) {
            long rows = pages.getRowCount();
            MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
            RecordReader<Group> recordReader =
                    columnIO.getRecordReader(pages, new GroupRecordConverter(schema));
            for (int i = 0; i < rows; i++) {
                SimpleGroup simpleGroup = (SimpleGroup) recordReader.read();
                simpleGroups.add(simpleGroup);
            }
        }
        reader.close();
        return new Parquet(simpleGroups, fields);
    }
}
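For completeness, this is roughly how I invoke it and print the records (a minimal sketch; the path and the Main class are only illustrative):

import org.apache.parquet.example.data.simple.SimpleGroup;

public class Main {
    public static void main(String[] args) throws Exception {
        // "filename" is the file written by the pyarrow snippet above.
        Parquet parquet = ParquetDump.getParquetData("filename");
        System.out.println("schema: " + parquet.getSchema());
        for (SimpleGroup group : parquet.getData()) {
            // SimpleGroup.toString() appears to print only fields that
            // hold a value, which is exactly where the null columns vanish.
            System.out.println(group);
        }
    }
}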
While debugging, I found that although the schema contains all of the columns, the data contains only the non-null columns.
Has anybody seen this behavior? Is there any option in the parquet-avro library to disable this 'optimization'?
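As far as I can tell, the missing values can still be reconstructed from the schema side, since Group.getFieldRepetitionCount(...) returns 0 for an optional field that has no value in a record. A minimal sketch of that idea (the helper below is my own, not part of any library):

import java.util.List;

import org.apache.parquet.example.data.simple.SimpleGroup;
import org.apache.parquet.schema.Type;

class NullAwareDump {
    // Hypothetical helper: prints every schema field for every record,
    // writing "null" where an optional field holds no value.
    static void printWithNulls(List<SimpleGroup> records, List<Type> fields) {
        for (SimpleGroup group : records) {
            StringBuilder row = new StringBuilder("{");
            for (int f = 0; f < fields.size(); f++) {
                row.append(fields.get(f).getName()).append("=");
                // A repetition count of 0 on an optional field means the
                // value is null in this record, even though the schema
                // still lists the column.
                if (group.getFieldRepetitionCount(f) == 0) {
                    row.append("null");
                } else {
                    row.append(group.getValueToString(f, 0));
                }
                if (f < fields.size() - 1) {
                    row.append(", ");
                }
            }
            System.out.println(row.append("}"));
        }
    }
}

Calling printWithNulls(parquet.getData(), parquet.getSchema()) on the result of getParquetData should then print the all-null columns explicitly instead of dropping them, but I would still prefer a built-in option if one exists.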
Thanks