
Parquet File Statistics
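
The method below walks an HDFS directory recursively and, for every *.parquet file it finds, reads only the file footer to accumulate the total record count and the on-disk size. The column count is taken from the schema of the first file, on the assumption that all files under the table path share one schema.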

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;

/**
 * Recursively walks tablePath and, for every *.parquet file, accumulates the
 * record count (read from the file footer) and the on-disk size. The column
 * count is taken from the first file's schema, assuming all files under the
 * table path share one schema.
 */
public static Map<String, Long> statisticsParquet(String tablePath) throws IOException {
    Map<String, Long> map = new HashMap<>(3);
    long rows = 0L;
    long columns = 0L;
    long fileSize = 0L;

    FileSystem fs = null;
    try {
        fs = FileSystem.get(HdfsUtils.getConfiguration());
        boolean isFirst = true;

        // true => recurse into sub-directories (e.g. partition directories)
        RemoteIterator<LocatedFileStatus> listFiles = fs.listFiles(new Path(tablePath), true);
        while (listFiles.hasNext()) {
            LocatedFileStatus fileStatus = listFiles.next();
            if (fileStatus.isFile() && fileStatus.getPath().toString().toLowerCase().endsWith(".parquet")) {
                Path parquetFile = fileStatus.getPath();
                fileSize += fileStatus.getLen();

                // Only the footer is read, so this stays cheap even for large files;
                // try-with-resources closes the reader even if an exception is thrown.
                try (ParquetFileReader parquetFileReader = new ParquetFileReader(
                        HdfsUtils.getConfiguration(), parquetFile, ParquetMetadataConverter.NO_FILTER)) {
                    rows += parquetFileReader.getRecordCount();

                    if (isFirst) {
                        columns = parquetFileReader.getFileMetaData().getSchema().getFieldCount();
                        isFirst = false;
                    }
                }
            }
        }
    } finally {
        // Caution: FileSystem.get() returns a cached instance by default, so
        // closing it also closes it for any other code sharing that instance.
        if (fs != null) {
            try {
                fs.close();
            } catch (IOException e) {
                logger.error("FileSystem close ", e);
            }
        }
    }

    map.put("rows", rows);
    map.put("columns", columns);
    map.put("fileSize", fileSize);
    return map;
}
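
A minimal usage sketch, assuming the same HdfsUtils/logger environment as above; the table path here is a hypothetical example, not part of the original snippet:

    // Hypothetical HDFS path to a table's data directory.
    Map<String, Long> stats = statisticsParquet("/warehouse/mydb/my_table");
    System.out.println("rows=" + stats.get("rows")
            + ", columns=" + stats.get("columns")
            + ", fileSize=" + stats.get("fileSize") + " bytes");

Note that the ParquetFileReader(Configuration, Path, MetadataFilter) constructor is deprecated in recent parquet-hadoop releases; on newer versions, ParquetFileReader.open(HadoopInputFile.fromPath(parquetFile, conf)) is the usual replacement.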