Reading ORC format files hosted on a Hadoop cluster using PXF might result in blank/empty data.
Refer to the Additional Information section for the unit test case.
Greenplum is case-insensitive to column names that are not enclosed in double quotes and folds them to all lowercase. So, if the column names of the table used for reading an ORC file do not match the case of the ORC file's column names, reading the file will result in blank/empty values for the mismatched columns.
Workaround -
Recreate the Greenplum readable external table defined on the specific ORC file with its column names enclosed in double quotes, matching the exact case of the ORC file's column names; that should produce the actual data values for those columns.
Refer to the Additional Information section for the unit test case.
Permanent Solution -
Fixed in PXF 6.11.3 onwards.
Unit test case to reproduce and validate the workaround
-- Setup: write sample data to an ORC file on HDFS via PXF.
-- The column names are double-quoted, so Greenplum preserves their case
-- and the ORC file is written with all-uppercase names (COL01 .. COL05).
DROP EXTERNAL TABLE IF EXISTS testing.export_to_orc;
CREATE WRITABLE EXTERNAL TABLE testing.export_to_orc ( "COL01" INT, "COL02" TEXT, "COL03" TEXT, "COL04" TEXT, "COL05" TEXT)
LOCATION ('pxf://data/testing?PROFILE=hdfs:orc&SERVER=testing')
FORMAT 'CUSTOM' (FORMATTER='pxfwritable_export');
-- Explicit column list guards against silent breakage if the table
-- definition ever changes; a single multi-row INSERT replaces eight
-- separate round trips with one statement.
INSERT INTO testing.export_to_orc ("COL01", "COL02", "COL03", "COL04", "COL05") VALUES
    (1, 'col2data1', 'col3data1', 'col4data1', 'col5data1'),
    (5, 'col2data5', 'col3data5', 'col4data5', 'col5data5'),
    (3, 'col2data3', 'col3data3', 'col4data3', 'col5data3'),
    (7, 'col2data7', 'col3data7', 'col4data7', 'col5data7'),
    (4, 'col2data4', 'col3data4', 'col4data4', 'col5data4'),
    (8, 'col2data8', 'col3data8', 'col4data8', 'col5data8'),
    (2, 'col2data2', 'col3data2', 'col4data2', 'col5data2'),
    (6, 'col2data6', 'col3data6', 'col4data6', 'col5data6');
-- Repro: read the ORC file through a table whose quoted column names
-- deliberately differ in case from the names stored in the ORC file.
-- "CoL02" and "col03" do NOT match the all-uppercase names written by
-- the export table above; "COL01", "COL04", and "COL05" match exactly.
drop external table if exists testing.read_from_orc;
CREATE EXTERNAL TABLE testing.read_from_orc ( "COL01" INT, "CoL02" TEXT, "col03" TEXT, "COL04" TEXT, "COL05" TEXT)
LOCATION ('pxf://data/testing?PROFILE=hdfs:orc&SERVER=testing')
FORMAT 'CUSTOM' (FORMATTER='pxfwritable_import');
-- Expected symptom on affected PXF versions (before 6.11.3): the
-- case-mismatched columns come back blank/empty while the matching
-- columns return real data — see the captured output below.
select * from testing.read_from_orc;
-- Workaround: quote every column name with the exact case the ORC file
-- uses — all uppercase here, so "COL02"/"COL03" replace the mismatched
-- "CoL02"/"col03" of the repro table.
drop external table if exists testing.read_from_orc_wrkarnd;
CREATE EXTERNAL TABLE testing.read_from_orc_wrkarnd (
    "COL01" INT,
    "COL02" TEXT,
    "COL03" TEXT,
    "COL04" TEXT,
    "COL05" TEXT
)
LOCATION ('pxf://data/testing?PROFILE=hdfs:orc&SERVER=testing')
FORMAT 'CUSTOM' (FORMATTER='pxfwritable_import');
-- With matching case, all five columns return actual data values.
select * from testing.read_from_orc_wrkarnd;
-- outputs
gpadmin=# select count(*) from testing.read_from_orc;
count
-------
8
(1 row)
-- problematic output
gpadmin=# select * from testing.read_from_orc;
COL01 | CoL02 | col03 | COL04 | COL05
-------+-------+-------+-----------+-----------
3 | | | col4data3 | col5data3
2 | | | col4data2 | col5data2
5 | | | col4data5 | col5data5
8 | | | col4data8 | col5data8
1 | | | col4data1 | col5data1
4 | | | col4data4 | col5data4
7 | | | col4data7 | col5data7
6 | | | col4data6 | col5data6
(8 rows)
gpadmin=#
-- workaround
gpadmin=# select * from testing.read_from_orc_wrkarnd;
COL01 | COL02 | COL03 | COL04 | COL05
-------+-----------+-----------+-----------+-----------
1 | col2data1 | col3data1 | col4data1 | col5data1
4 | col2data4 | col3data4 | col4data4 | col5data4
3 | col2data3 | col3data3 | col4data3 | col5data3
2 | col2data2 | col3data2 | col4data2 | col5data2
7 | col2data7 | col3data7 | col4data7 | col5data7
6 | col2data6 | col3data6 | col4data6 | col5data6
5 | col2data5 | col3data5 | col4data5 | col5data5
8 | col2data8 | col3data8 | col4data8 | col5data8
(8 rows)
gpadmin=#