Source connector configuration
{
  "name": "avro-inventory-customers",
  "config": {
    "connector.class": "io.debezium.connector.mysql.MySqlConnector",
    "tasks.max": "1",
    "database.hostname": "psd-hadoop039",
    "database.port": "3306",
    "database.user": "debezium",
    "database.password": "dbz",
    "database.server.name": "avro",
    "database.whitelist": "inventory",
    "database.history.kafka.bootstrap.servers": "psd-hadoop039:9092",
    "database.history.kafka.topic": "dbhistory.avro.inventory",
    "table.whitelist": "inventory.customers",
    "key.converter": "io.confluent.connect.avro.AvroConverter",
    "key.converter.schema.registry.url": "http://psd-hadoop039:8081",
    "value.converter": "io.confluent.connect.avro.AvroConverter",
    "value.converter.schema.registry.url": "http://psd-hadoop039:8081",
    "include.schema.changes": "true",
    "tombstones.on.delete": "false",
    "binary.handling.mode": "hex",
    "time.precision.mode": "connect"
  }
}
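For reference, this JSON is submitted to the Kafka Connect worker's REST API. A minimal sketch in Java, assuming the worker's REST endpoint is on the default port 8083 and the JSON above is saved locally as source.json (both assumptions, not from the original post):

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.file.Files;
import java.nio.file.Path;

public class RegisterSourceConnector {
    public static void main(String[] args) throws Exception {
        // Assumed file name and REST port (8083 is the Connect default).
        String body = Files.readString(Path.of("source.json"));
        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create("http://psd-hadoop039:8083/connectors"))
                .header("Content-Type", "application/json")
                .POST(HttpRequest.BodyPublishers.ofString(body))
                .build();
        HttpResponse<String> response = HttpClient.newHttpClient()
                .send(request, HttpResponse.BodyHandlers.ofString());
        // 201 Created means the connector was registered.
        System.out.println(response.statusCode() + " " + response.body());
    }
}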
Sink connector configuration
{
  "name": "avro.inventory.customers.sink",
  "config": {
    "connector.class": "io.confluent.connect.hdfs.HdfsSinkConnector",
    "format.class": "io.confluent.connect.hdfs.avro.AvroFormat",
    "key.converter": "io.confluent.connect.avro.AvroConverter",
    "key.converter.schema.registry.url": "http://psd-hadoop039:8081",
    "value.converter": "io.confluent.connect.avro.AvroConverter",
    "value.converter.schema.registry.url": "http://psd-hadoop039:8081",
    "key.converter.schemas.enable": "true",
    "value.converter.schemas.enable": "true",
    "tasks.max": "1",
    "topics": "avro.inventory.customers",
    "transforms": "unwrap",
    "transforms.unwrap.type": "io.debezium.transforms.ExtractNewRecordState",
    "transforms.unwrap.drop.tombstones": "true",
    "transforms.unwrap.delete.handling.mode": "rewrite",
    "transforms.unwrap.add.headers": "name,db,table,op,db",
    "transforms.unwrap.add.fields": "name,db,table,op,file,pos,row,ts_ms,source.ts_ms",
    "hadoop.conf.dir": "/etc/hadoop/conf",
    "store.url": "hdfs://cdhtest",
    "logs.dir": "/user/dts/logs",
    "topics.dir": "/user/dts/topics",
    "flush.size": "2",
    "rotate.interval.ms": "10000",
    "hive.integration": true,
    "hive.database": "dts",
    "hive.metastore.uris": "thrift://cdh-10-21-17-95:9083",
    "partitioner.class": "io.confluent.connect.hdfs.partitioner.HourlyPartitioner",
    "locale": "zh",
    "timezone": "Asia/Shanghai",
    "path.format": "YYYYMMddHH/",
    "schema.compatibility": "BACKWARD"
  }
}
Data changes => insert and update: OK; delete fails with the error below.
Error message
[2020-08-05 10:23:11,028] ERROR WorkerSinkTask{id=avro.inventory.customers.sink-0} Task threw an uncaught and unrecoverable exception. Task is being killed and will not recover until manually restarted. Error: org.apache.kafka.connect.errors.SchemaProjectorException: Schema version required for BACKWARD compatibility (org.apache.kafka.connect.runtime.WorkerSinkTask:565)
java.lang.RuntimeException: org.apache.kafka.connect.errors.SchemaProjectorException: Schema version required for BACKWARD compatibility
at io.confluent.connect.hdfs.TopicPartitionWriter.write(TopicPartitionWriter.java:407)
at io.confluent.connect.hdfs.DataWriter.write(DataWriter.java:386)
at io.confluent.connect.hdfs.HdfsSinkTask.put(HdfsSinkTask.java:127)
at org.apache.kafka.connect.runtime.WorkerSinkTask.deliverMessages(WorkerSinkTask.java:545)
at org.apache.kafka.connect.runtime.WorkerSinkTask.poll(WorkerSinkTask.java:325)
at org.apache.kafka.connect.runtime.WorkerSinkTask.iteration(WorkerSinkTask.java:228)
at org.apache.kafka.connect.runtime.WorkerSinkTask.execute(WorkerSinkTask.java:200)
at org.apache.kafka.connect.runtime.WorkerTask.doRun(WorkerTask.java:184)
at org.apache.kafka.connect.runtime.WorkerTask.run(WorkerTask.java:234)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.kafka.connect.errors.SchemaProjectorException: Schema version required for BACKWARD compatibility
at io.confluent.connect.storage.schema.StorageSchemaCompatibility.validateAndCheck(StorageSchemaCompatibility.java:157)
at io.confluent.connect.storage.schema.StorageSchemaCompatibility.shouldChangeSchema(StorageSchemaCompatibility.java:320)
at io.confluent.connect.hdfs.TopicPartitionWriter.write(TopicPartitionWriter.java:361)
... 13 more
[2020-08-05 10:23:11,039] ERROR WorkerSinkTask{id=avro.inventory.customers.sink-0} Task threw an uncaught and unrecoverable exception (org.apache.kafka.connect.runtime.WorkerTask:186)
org.apache.kafka.connect.errors.ConnectException: Exiting WorkerSinkTask due to unrecoverable exception.
at org.apache.kafka.connect.runtime.WorkerSinkTask.deliverMessages(WorkerSinkTask.java:567)
at org.apache.kafka.connect.runtime.WorkerSinkTask.poll(WorkerSinkTask.java:325)
at org.apache.kafka.connect.runtime.WorkerSinkTask.iteration(WorkerSinkTask.java:228)
at org.apache.kafka.connect.runtime.WorkerSinkTask.execute(WorkerSinkTask.java:200)
at org.apache.kafka.connect.runtime.WorkerTask.doRun(WorkerTask.java:184)
at org.apache.kafka.connect.runtime.WorkerTask.run(WorkerTask.java:234)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.RuntimeException: org.apache.kafka.connect.errors.SchemaProjectorException: Schema version required for BACKWARD compatibility
at io.confluent.connect.hdfs.TopicPartitionWriter.write(TopicPartitionWriter.java:407)
at io.confluent.connect.hdfs.DataWriter.write(DataWriter.java:386)
at io.confluent.connect.hdfs.HdfsSinkTask.put(HdfsSinkTask.java:127)
at org.apache.kafka.connect.runtime.WorkerSinkTask.deliverMessages(WorkerSinkTask.java:545)
... 10 more
Caused by: org.apache.kafka.connect.errors.SchemaProjectorException: Schema version required for BACKWARD compatibility
at io.confluent.connect.storage.schema.StorageSchemaCompatibility.validateAndCheck(StorageSchemaCompatibility.java:157)
at io.confluent.connect.storage.schema.StorageSchemaCompatibility.shouldChangeSchema(StorageSchemaCompatibility.java:320)
at io.confluent.connect.hdfs.TopicPartitionWriter.write(TopicPartitionWriter.java:361)
... 13 more
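As the log states, the task is killed and will not recover on its own; once the schema issue is addressed, it has to be restarted manually. A minimal sketch of restarting the failed task via the Connect REST API (the worker host and default port 8083 are assumptions):

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class RestartSinkTask {
    public static void main(String[] args) throws Exception {
        // Restart task 0 of the sink connector named in the config above.
        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create(
                        "http://psd-hadoop039:8083/connectors/avro.inventory.customers.sink/tasks/0/restart"))
                .POST(HttpRequest.BodyPublishers.noBody())
                .build();
        HttpResponse<String> response = HttpClient.newHttpClient()
                .send(request, HttpResponse.BodyHandlers.ofString());
        // 204 No Content means the restart request was accepted.
        System.out.println(response.statusCode());
    }
}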
Schema changes => adding a column fails with the error below.
[2020-08-05 11:04:05,040] ERROR WorkerSinkTask{id=avro.inventory.customers.sink-0} Task threw an uncaught and unrecoverable exception. Task is being killed and will not recover until manually restarted. Error: org.apache.kafka.connect.errors.SchemaProjectorException: Schema version required for BACKWARD compatibility (org.apache.kafka.connect.runtime.WorkerSinkTask:565)
java.lang.RuntimeException: org.apache.kafka.connect.errors.SchemaProjectorException: Schema version required for BACKWARD compatibility
at io.confluent.connect.hdfs.TopicPartitionWriter.write(TopicPartitionWriter.java:407)
at io.confluent.connect.hdfs.DataWriter.write(DataWriter.java:386)
at io.confluent.connect.hdfs.HdfsSinkTask.put(HdfsSinkTask.java:127)
at org.apache.kafka.connect.runtime.WorkerSinkTask.deliverMessages(WorkerSinkTask.java:545)
at org.apache.kafka.connect.runtime.WorkerSinkTask.poll(WorkerSinkTask.java:325)
at org.apache.kafka.connect.runtime.WorkerSinkTask.iteration(WorkerSinkTask.java:228)
at org.apache.kafka.connect.runtime.WorkerSinkTask.execute(WorkerSinkTask.java:200)
at org.apache.kafka.connect.runtime.WorkerTask.doRun(WorkerTask.java:184)
at org.apache.kafka.connect.runtime.WorkerTask.run(WorkerTask.java:234)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.kafka.connect.errors.SchemaProjectorException: Schema version required for BACKWARD compatibility
at io.confluent.connect.storage.schema.StorageSchemaCompatibility.validateAndCheck(StorageSchemaCompatibility.java:157)
at io.confluent.connect.storage.schema.StorageSchemaCompatibility.shouldChangeSchema(StorageSchemaCompatibility.java:320)
at io.confluent.connect.hdfs.TopicPartitionWriter.write(TopicPartitionWriter.java:361)
... 13 more
[2020-08-05 11:04:05,041] ERROR WorkerSinkTask{id=avro.inventory.customers.sink-0} Task threw an uncaught and unrecoverable exception (org.apache.kafka.connect.runtime.WorkerTask:186)
org.apache.kafka.connect.errors.ConnectException: Exiting WorkerSinkTask due to unrecoverable exception.
at org.apache.kafka.connect.runtime.WorkerSinkTask.deliverMessages(WorkerSinkTask.java:567)
at org.apache.kafka.connect.runtime.WorkerSinkTask.poll(WorkerSinkTask.java:325)
at org.apache.kafka.connect.runtime.WorkerSinkTask.iteration(WorkerSinkTask.java:228)
at org.apache.kafka.connect.runtime.WorkerSinkTask.execute(WorkerSinkTask.java:200)
at org.apache.kafka.connect.runtime.WorkerTask.doRun(WorkerTask.java:184)
at org.apache.kafka.connect.runtime.WorkerTask.run(WorkerTask.java:234)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.RuntimeException: org.apache.kafka.connect.errors.SchemaProjectorException: Schema version required for BACKWARD compatibility
at io.confluent.connect.hdfs.TopicPartitionWriter.write(TopicPartitionWriter.java:407)
at io.confluent.connect.hdfs.DataWriter.write(DataWriter.java:386)
at io.confluent.connect.hdfs.HdfsSinkTask.put(HdfsSinkTask.java:127)
at org.apache.kafka.connect.runtime.WorkerSinkTask.deliverMessages(WorkerSinkTask.java:545)
... 10 more
Caused by: org.apache.kafka.connect.errors.SchemaProjectorException: Schema version required for BACKWARD compatibility
at io.confluent.connect.storage.schema.StorageSchemaCompatibility.validateAndCheck(StorageSchemaCompatibility.java:157)
at io.confluent.connect.storage.schema.StorageSchemaCompatibility.shouldChangeSchema(StorageSchemaCompatibility.java:320)
at io.confluent.connect.hdfs.TopicPartitionWriter.write(TopicPartitionWriter.java:361)
... 13 more
Cause
Let's look at the following code:
io.confluent.connect.hdfs.TopicPartitionWriter#write
if (currentSchema == null) {
  if (compatibility != StorageSchemaCompatibility.NONE && offset != -1) {
    String topicDir = FileUtils.topicDirectory(url, topicsDir, tp.topic());
    CommittedFileFilter filter = new TopicPartitionCommittedFileFilter(tp);
    FileStatus fileStatusWithMaxOffset = FileUtils.fileStatusWithMaxOffset(
        storage,
        new Path(topicDir),
        filter
    );
    if (fileStatusWithMaxOffset != null) {
      currentSchema = schemaFileReader.getSchema(
          connectorConfig,
          fileStatusWithMaxOffset.getPath()
      ); // (1) the schema read back from the latest committed HDFS file becomes currentSchema
    }
  }
}
SinkRecord record = buffer.peek();
currentRecord = record;
Schema valueSchema = record.valueSchema(); // (2) the schema of the record just read from Kafka
if ((recordCounter <= 0 && currentSchema == null && valueSchema != null)
    || compatibility.shouldChangeSchema(record, null, currentSchema)) {
  currentSchema = valueSchema;
  if (hiveIntegration) {
    createHiveTable();
    alterHiveSchema();
  }
  if (recordCounter > 0) {
    nextState();
  } else {
    break;
  }
}
io.confluent.connect.storage.schema.StorageSchemaCompatibility#validateAndCheck
protected boolean validateAndCheck(Schema valueSchema, Schema currentSchema) {
  if (currentSchema == null && valueSchema == null) {
    return false;
  } else if (currentSchema == valueSchema) {
    return false;
  } else if (currentSchema != null && valueSchema != null) {
    if ((valueSchema.version() == null || currentSchema.version() == null) && this != NONE) {
      // (3) while debugging, both versions here turned out to be null
      throw new SchemaProjectorException("Schema version required for " + this.toString() + " compatibility");
    } else {
      return this.check(valueSchema, currentSchema);
    }
  } else {
    throw new SchemaProjectorException("Switch between schema-based and schema-less data is not supported");
  }
}
When the Kafka Connect task is rescheduled, it reads the latest committed HDFS file to rebuild currentSchema, and takes valueSchema from the record fetched from Kafka. Steps (1), (2), and (3) in the code above show why the error occurs: both schemas end up with a null version, and because schema.compatibility is BACKWARD a version is required, so the SchemaProjectorException seen in the logs is thrown.
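As a standalone illustration of step (3): a Connect schema carries no version unless one is set explicitly via SchemaBuilder.version(...). The sketch below (a simplified re-implementation of the failing branch, not the connector's actual API; it only needs connect-api on the classpath, and the schema shapes are made up for the example) shows how two versionless schemas trip the BACKWARD check.

import org.apache.kafka.connect.data.Schema;
import org.apache.kafka.connect.data.SchemaBuilder;

public class VersionCheckDemo {
    // Simplified mirror of the throwing branch in validateAndCheck; illustrative only.
    static boolean wouldThrow(Schema valueSchema, Schema currentSchema) {
        return valueSchema != null && currentSchema != null
                && valueSchema != currentSchema
                && (valueSchema.version() == null || currentSchema.version() == null);
    }

    public static void main(String[] args) {
        // Schemas built without an explicit .version(...) report version() == null.
        Schema fromHdfsFile = SchemaBuilder.struct()
                .field("id", Schema.INT32_SCHEMA)
                .build();
        Schema fromKafkaRecord = SchemaBuilder.struct()
                .field("id", Schema.INT32_SCHEMA)
                .field("email", Schema.OPTIONAL_STRING_SCHEMA)
                .build();

        System.out.println(fromHdfsFile.version());                    // null
        System.out.println(wouldThrow(fromKafkaRecord, fromHdfsFile)); // true -> the sink raises SchemaProjectorException
    }
}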