Sqoop+Hive修改误差数据

需求

现在发现职业分析数据库中,各大城市中工作1-3的大数据人员的薪资出现偏差,偏差值为2000, 修改该偏差值,并更新RBDMS中。

创建textfile hive表

create table work.job_info_textfile(
key string,
salary float
) 
row format delimited fields terminated by '\t';

将误差数据导入到hive中

sqoop import \
--connect jdbc:mysql://hadoop1:3306/test \
--username root \
--password 123456 \
--table recruitment \
--columns 'id, salary' \
--where ' work_experience>=1 and work_experience<=3' \
--delete-target-dir \
--fields-terminated-by '\t' \
--direct \
--hive-import \
--hive-database work \
--hive-table job_info_textfile

创建orc 更新ACID表

create table work.job_info_orc(
key string,
salary float
) 
clustered by (key) into 2 buckets
row format delimited fields terminated by '\t'
stored as ORC tblproperties ('transactional'='true');

将误差数据导入orc格式的表中

insert into table work.job_info_orc
select key, salary
from work.job_info_textfile;

将误差修正

update work.job_info_orc SET salary = salary-2000;

将修正好的数据,导致textfile中

insert overwrite table work.job_info_textfile
select key, salary from work.job_info_orc;

将mysql中的误差数据更新

sqoop export \
--connect jdbc:mysql://hadoop1:3306/test \
--username root \
--password 123456 \
--driver com.mysql.jdbc.Driver \
--columns 'id,salary' \
--num-mappers 5\
--table recruitment \
--fields-terminated-by '\t' \
--export-dir /user/hive/warehouse/work.db/job_info_textfile \
--update-key 'id' \
--direct