Hudi Streamer 数据导入示例 TPCH

2022-09-19 warehouse hudi

Nation

----- 建议通过 Spark-SQL 执行
-- Recreate the TPC-H nation table as a Hudi table.
-- * Dropping the table first keeps this script re-runnable.
-- * n_comment is the only column that allows NULL.
-- * primaryKey names the same column as
--   hoodie.datasource.write.recordkey.field in nation.properties.
-- * type 'cow' pairs with --table-type COPY_ON_WRITE in the import command.
-- NOTE(review): the LOCATION directory is spelled "tcph", while the other
-- paths in this note use "tpch". The import command's --target-base-path
-- uses the same "tcph" spelling, so rename both together or neither.
DROP TABLE IF EXISTS nation;

CREATE TABLE nation (
    n_nationkey  INT           NOT NULL,
    n_name       VARCHAR(25)   NOT NULL,
    n_regionkey  INT           NOT NULL,
    n_comment    VARCHAR(256)
)
USING hudi
OPTIONS (
    primaryKey = 'n_nationkey',
    type       = 'cow'
)
LOCATION 'file:///svr/data/hudi/tcph/nation';

----- Hudi 相关配置项(也可以在下文的 spark-submit 命令行中通过 --hoodie-conf 逐项指定)
# cat /svr/hudi/tpch/nation.properties 
# Turn the embedded timeline server off for this job.
hoodie.embed.timeline.server=false
# Record key column; the Spark SQL DDL declares the same column as primaryKey.
hoodie.datasource.write.recordkey.field=n_nationkey
# Directory the source reads its input files from.
hoodie.streamer.source.dfs.root=file:///svr/hudi/tpch/nation/
# CSV input layout: fields are separated by '|' and there is no header row.
hoodie.streamer.csv.sep=|
hoodie.streamer.csv.header=false
# Target and source schema are both read from the same Avro schema file.
hoodie.streamer.schemaprovider.target.schema.file=file:///svr/hudi/tpch/nation.avsc
hoodie.streamer.schemaprovider.source.schema.file=file:///svr/hudi/tpch/nation.avsc

----- 表结构信息
# cat /svr/hudi/tpch/nation.avsc
{
    "type": "record",
    "name": "nation",
    "fields": [
        {"name": "n_nationkey", "type": "int"},
        {"name": "n_name",      "type": "string"},
        {"name": "n_regionkey", "type": "int"},
        {"name": "n_comment",   "type": "string"}
    ]
}

----- 执行数据导入
----- Spark options come before the bundle jar; HoodieStreamer options follow it.
----- 'local[2]' is quoted so the shell never treats [2] as a glob pattern.
----- --target-base-path must be the same path as the LOCATION of the nation table.
# spark-submit \
    --master 'local[2]' \
    --deploy-mode client \
    --driver-memory 20g \
    --class org.apache.hudi.utilities.streamer.HoodieStreamer \
    /svr/hudi/hudi-utilities-bundle_2.12-0.14.1.jar \
    --props file:///svr/hudi/tpch/nation.properties \
    --source-class org.apache.hudi.utilities.sources.CsvDFSSource \
    --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
    --table-type COPY_ON_WRITE \
    --op BULK_INSERT \
    --target-base-path file:///svr/data/hudi/tcph/nation \
    --target-table tpch.nation