DataX_HAWQ_安装&&使用

2020/09/04 HAWQ 共 1781 字,约 6 分钟

参考

DataX_HashData

OS 环境

CentOS Linux release 7.4.1708 (Core)

HAWQ版本

PostgreSQL 8.2.15 (Greenplum Database 4.2.0 build 1) 
(HAWQ 2.4.0.0 build dev) on x86_64-unknown-linux-gnu, 
compiled by GCC gcc (GCC) 4.8.5 20150623 (Red Hat 4.8.5-39) 
compiled on Mar 28 2020 17:33:50

安装dataX

mkdir -p /home/gpadmin/dataX/
cd /home/gpadmin/dataX/
wget https://github.com/HashDataInc/DataX/archive/v1.0.4.zip

unzip v1.0.4.zip
cd /home/gpadmin/dataX/DataX-1.0.4

# 源码编译
mvn -U clean package assembly:assembly -Dmaven.test.skip=true

cd /home/gpadmin/dataX/DataX-1.0.4/target/
mv datax-v1.0.4-hashdata /opt/

DB中表准备

create table s(id int,name text);INSERT INTO S select generate_series(1,1000),'您好qwer!@#$%^&*()';
create table d(id int,name text);

--## 如果是HAWQ请在DB中执行下列函数,如果是greenplum或hashdata请跳过该步骤
CREATE FUNCTION gp_truncate_error_log(name text,out res int) AS $$ SELECT 0 $$ LANGUAGE SQL;

找到datax

cd /opt/datax-v1.0.4-hashdata/datax/bin/

生成postgresql to hawq 的导数模版

对应参数说明文档

postgresqlreader
gpdbwriter


python datax.py -r postgresqlreader -w gpdbwriter >pg2hawq.json.tamplate

制作导致json文件

{
    "job": {
        "content": [
            {
                "reader": {
                    "name": "postgresqlreader",
                    "parameter": {
                        "column": [ "id","name" ],
                        "connection": [
                            {
                                "jdbcUrl": [ "jdbc:postgresql://127.0.0.1:5432/test" ],
                                "table":  [ "public.s" ]
                            }
                        ],
                        "password": "gpadmin",
                        "username": "gpadmin"
                    }
                },
                "writer": {
                    "name": "gpdbwriter",
                    "parameter": {
                        "column": [ "id","name" ],
                        "connection": [
                            {
                                "jdbcUrl": "jdbc:postgresql://127.0.0.1:5432/test",
                                "table": [ "public.d" ]
                            }
                        ],
                        "preSql": [ "select 'beginTime: '||now();truncate table d" ],
                        "postSql": [ "select 'endTime: '||now() " ],
                        "segment_reject_limit": 0,
                        "username": "gpadmin",
                        "password": "gpadmin"
                    }
                }
            }
        ],
        "setting": {
            "speed": {
                "channel": "1"
            }
        }
    }
}

执行job

python datax.py pg2hawq.json

列出GpdbWriter针对PostgreSQL类型转换列表

DataX 内部类型PostgreSQL 数据类型
Longbigint, bigserial, integer, smallint, serial
Doubledouble precision, money, numeric, real
Stringvarchar, char, text, bit
Datedate, time, timestamp
Booleanbool
Bytesbytea

文档信息

Search

    Table of Contents