impalaSide.md

May 21, 2020 · View on GitHub

1.格式:

 CREATE TABLE tableName(
     colName cloType,
     ...
     PRIMARY KEY(keyInfo),
     PERIOD FOR SYSTEM_TIME
  )WITH(
     type='impala',
     url='jdbcUrl',
     userName='dbUserName',
     password='dbPwd',
     tableName='tableName',
     cache ='LRU',
     cacheSize ='10000',
     cacheTTLMs ='60000',
     parallelism ='1',
     partitionedJoin='false'
  );

2.支持版本

2.10.0-cdh5.13.0

3.表结构定义

维表参数信息

impala独有的参数配置

参数名称含义是否必填默认值
type表明维表的类型[impala]
url连接postgresql数据库 jdbcUrl
userNamepostgresql连接用户名
passwordpostgresql连接密码
tableNamepostgresql表名称
authMech身份验证机制 (0, 1, 2, 3), 暂不支持kerberos0
principalkerberos用于登录的principal(authMech=1时独有)authMech=1为必填
keyTabFilePathkeytab文件的路径(authMech=1时独有)authMech=1为必填
krb5FilePathkrb5.conf文件路径(authMech=1时独有)authMech=1为必填
krbServiceNameImpala服务器的Kerberos principal名称(authMech=1时独有)authMech=1为必填
krbRealmKerberos的域名(authMech=1时独有)HADOOP.COM
enablePartition是否支持分区false
partitionfields分区字段名否,enablePartition='true'时为必填
partitionFieldTypes分区字段类型否,enablePartition='true'时为必填
partitionValues分区值否,enablePartition='true'时为必填
cache维表缓存策略(NONE/LRU/ALL)NONE
partitionedJoin是否在維表join之前先根据 設定的key 做一次keyby操作(可以減少维表的数据缓存量)false

4.样例

ALL全量维表定义

 // 定义全量维表
CREATE TABLE sideTable(
    id INT,
    name VARCHAR,
    PRIMARY KEY(id) ,
    PERIOD FOR SYSTEM_TIME
 )WITH(
    type ='mysql',
    url ='jdbc:impala://localhost:21050/mqtest',
    userName ='dtstack',
    password ='1abc123',
    tableName ='test_impala_all',
    authMech='3',
    cache ='ALL',
    cacheTTLMs ='60000',
    parallelism ='2',
    partitionedJoin='false'
 );

LRU异步维表定义

create table sideTable(
    channel varchar,
    xccount int,
    PRIMARY KEY(channel),
    PERIOD FOR SYSTEM_TIME
 )WITH(
    type='impala',
    url='jdbc:impala://localhost:21050/mytest',
    userName='dtstack',
    password='abc123',
    tableName='sidetest',
    authMech='3',
    cache ='LRU',
    cacheSize ='10000',
    cacheTTLMs ='60000',
    parallelism ='1',
    partitionedJoin='false'
 );

MySQL异步维表关联

CREATE TABLE MyTable(
    id int,
    name varchar
 )WITH(
    type ='kafka11',
    bootstrapServers ='172.16.8.107:9092',
    zookeeperQuorum ='172.16.8.107:2181/kafka',
    offsetReset ='latest',
    topic ='cannan_yctest01',
    timezone='Asia/Shanghai',
    enableKeyPartitions ='false',
    topicIsPattern ='false',
    parallelism ='1'
 );

CREATE TABLE MyResult(
    id INT,
    name VARCHAR
 )WITH(
    type='impala',
    url='jdbc:impala://localhost:21050/mytest',
    userName='dtstack',
    password='abc123',
    tableName ='test_impala_zf',
    updateMode ='append',
    parallelism ='1',
    batchSize ='100',
    batchWaitInterval ='1000'
 );

CREATE TABLE sideTable(
    id INT,
    name VARCHAR,
    PRIMARY KEY(id) ,
    PERIOD FOR SYSTEM_TIME
 )WITH(
    type='impala',
    url='jdbc:impala://localhost:21050/mytest',
    userName='dtstack',
    password='abc123',
    tableName ='test_impala_10',
    partitionedJoin ='false',
    cache ='LRU',
    cacheSize ='10000',
    cacheTTLMs ='60000',
    asyncPoolSize ='3',
    parallelism ='1'
 );

insert   
into
    MyResult
    select
        m.id,
        s.name     
    from
        MyTable  m    
    join
        sideTable s             
            on m.id=s.id;

分区样例

注:分区字段放在最后面,如下,name是分区字段,放在channel,xccount字段的后面

create table sideTable(
    channel varchar,
    xccount int,
    name varchar,
    PRIMARY KEY(channel),
    PERIOD FOR SYSTEM_TIME
 )WITH(
    type='impala',
    url='jdbc:impala://localhost:21050/mytest',
    userName='dtstack',
    password='abc123',
    tableName='sidetest',
    authMech='3',
    cache ='LRU',
    cacheSize ='10000',
    cacheTTLMs ='60000',
    parallelism ='1',
    enablePartition='true',
    partitionfields='name',
    partitionFieldTypes='varchar',
    partitionValues='{"name":["tom","jeck"]}',
    partitionedJoin='false'
 );