【ODPS】TableTunnel多线程下载示例

本文介绍如何使用多线程技术从阿里云ODPS高效下载大数据文件。通过创建多线程下载类并利用TableTunnel接口实现分区表的数据读取,显著提高了下载速度。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

上一篇介绍了ODPS单线程简单下载:http://blog.youkuaiyun.com/jyl932099427/article/details/47660249

本篇介绍多线程下载


1.多线程下载类:

package bysql;

import java.io.BufferedWriter;
import java.io.IOException;
import java.util.Date;
import java.util.concurrent.Callable;

import com.aliyun.odps.Column;
import com.aliyun.odps.TableSchema;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.data.RecordReader;

/**
 * Task that drains a single {@link RecordReader} and writes every record it
 * yields, as one text line, to a {@link BufferedWriter}.
 *
 * <p>The task returns the number of records it read. The writer is never
 * closed here; that is left to whoever created it. The reader, in contrast,
 * is always closed by {@link #call()}, whether or not reading succeeded.
 */
public class DownloadThread implements Callable<Long> {

	/** Separator placed between two columns of the same output line. */
	private static final String COLUMN_SEPARATOR = ",        ";

	private final long id;
	private final RecordReader recordReader;
	private final TableSchema tableSchema;
	private final BufferedWriter out;

	/**
	 * @param id           identifier printed in front of every line this task writes
	 * @param recordReader source of the records to download; closed by {@link #call()}
	 * @param tableSchema  schema used to decode each column of a record
	 * @param out          destination writer; not closed by this task
	 */
	public DownloadThread(int id, RecordReader recordReader,
			TableSchema tableSchema, BufferedWriter out) {
		this.id = id;
		this.recordReader = recordReader;
		this.tableSchema = tableSchema;
		this.out = out;
	}

	/**
	 * Reads records until the reader returns {@code null} and writes each one
	 * to the output.
	 *
	 * <p>An {@link IOException} raised while reading or writing stops the task
	 * early; it is reported on standard error and the count reached so far is
	 * returned, so callers should compare the total against the expected
	 * record count.
	 *
	 * @return number of records read from the reader
	 */
	@Override
	public Long call() throws Exception {
		// Primitive counter: avoids re-boxing a Long on every record.
		long recordNum = 0L;
		try {
			Record record;
			while ((record = recordReader.read()) != null) {
				recordNum++;
				consumeRecord(record, tableSchema, out, id);
			}
		} catch (IOException e) {
			// Best effort: keep the partial count, but make the failure visible.
			System.err.println("Thread " + id + " stopped after " + recordNum
					+ " record(s) because of an I/O error");
			e.printStackTrace();
		} finally {
			// Release the reader even when reading or writing failed.
			try {
				recordReader.close();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
		return recordNum;
	}

	/**
	 * Formats one record as {@code 【Thread id】name:value,        name:value...}
	 * followed by the platform line separator, and writes it with a single
	 * {@code write} call.
	 *
	 * @throws IOException      if the writer fails
	 * @throws RuntimeException if a column has a type other than BIGINT,
	 *                          BOOLEAN, DATETIME, DOUBLE or STRING
	 */
	private static void consumeRecord(Record record, TableSchema schema,
			BufferedWriter out, long id) throws IOException {
		int columnCount = schema.getColumns().size();
		StringBuilder line = new StringBuilder();
		line.append("【Thread ").append(id).append("】");
		for (int i = 0; i < columnCount; i++) {
			Column column = schema.getColumn(i);
			Object value;
			switch (column.getType()) {
			case BIGINT:
				value = record.getBigint(i);
				break;
			case BOOLEAN:
				value = record.getBoolean(i);
				break;
			case DATETIME:
				value = record.getDatetime(i);
				break;
			case DOUBLE:
				value = record.getDouble(i);
				break;
			case STRING:
				value = record.getString(i);
				break;
			default:
				throw new RuntimeException("Unknown column type: "
						+ column.getType());
			}
			if (i > 0) {
				line.append(COLUMN_SEPARATOR);
			}
			// String.valueOf renders a missing value as the text "null".
			line.append(column.getName()).append(':').append(String.valueOf(value));
		}
		line.append(System.getProperty("line.separator"));
		out.write(line.toString());
	}

}


2.多线程下载示例:

package bysql;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import com.aliyun.odps.Odps;
import com.aliyun.odps.PartitionSpec;
import com.aliyun.odps.account.Account;
import com.aliyun.odps.account.AliyunAccount;
import com.aliyun.odps.data.RecordReader;
import com.aliyun.odps.tunnel.TableTunnel;
import com.aliyun.odps.tunnel.TunnelException;

/**
 * Sample program that downloads one partition of an ODPS table through
 * {@link TableTunnel}, splitting the record range across several
 * {@link DownloadThread} tasks that all write to the same text file.
 */
public class DownloadThreadSample {

	private static final String ACCESS_ID = "<your access id>";
	private static final String ACCESS_KEY =  "<your access Key>";
	private static final String PROJECT_NAME = "<your project>";
	private static final String TUNNEL_URL = "<your tunnel endpoint>";
	private static final String ODPS_URL = "<your odps endpoint>";

	/** Upper bound on the number of download tasks run in parallel. */
	private static final int THREAD_NUM = 6;

	/**
	 * Downloads partition {@code z=1} of table {@code point_z} into a file on
	 * drive {@code G:} and prints the record counts and the elapsed time.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {

		String tableName = "point_z"; // table name

		/* Build the Aliyun account first. */
		Account account = new AliyunAccount(ACCESS_ID, ACCESS_KEY);

		/* Odps is the entry point of the ODPS SDK. */
		Odps odps = new Odps(account);
		odps.setDefaultProject(PROJECT_NAME); // default project name
		odps.setEndpoint(ODPS_URL); // ODPS service address

		/* Entry class for the ODPS Tunnel service. */
		TableTunnel tunnel = new TableTunnel(odps);
		tunnel.setEndpoint(TUNNEL_URL); // tunnel server address

		// Declared outside the try block so that finally can release them.
		BufferedWriter out = null;
		ExecutorService pool = null;
		try {
			/*
			 * point_z is a partitioned table, so the partition to download
			 * must be specified.
			 */
			PartitionSpec partitionSpec = new PartitionSpec();
			partitionSpec.set("z", "1");

			System.out.println("开始下载数据.........");
			File file = new File("G:\\" + tableName + "(多线程).txt");
			// Opening in truncate mode creates the file or empties an existing
			// one, so a failed delete cannot leave stale content behind.
			out = new BufferedWriter(new OutputStreamWriter(
					new FileOutputStream(file, false), "utf-8"));

			long startTime = System.currentTimeMillis();

			TableTunnel.DownloadSession downloadSession = tunnel
					.createDownloadSession(PROJECT_NAME, tableName, partitionSpec);

			long count = downloadSession.getRecordCount();
			System.out.println("RecordCount is: " + count);

			// Never use more tasks than there are records; otherwise step
			// would be 0 and most readers would cover an empty range.
			int workers = (int) Math.max(1L, Math.min((long) THREAD_NUM, count));
			long step = count / workers;

			List<Callable<Long>> callers = new ArrayList<Callable<Long>>(workers);
			for (int i = 0; i < workers; i++) {
				long offset = step * i;
				// The last task also takes the remainder of the division.
				long length = (i == workers - 1) ? count - offset : step;
				RecordReader recordReader = downloadSession.openRecordReader(
						offset, length);
				callers.add(new DownloadThread(i, recordReader,
						downloadSession.getSchema(), out));
			}

			pool = Executors.newFixedThreadPool(workers);
			long downloadNum = 0L;
			// invokeAll blocks until every task has completed.
			for (Future<Long> num : pool.invokeAll(callers)) {
				downloadNum += num.get();
			}
			System.out.println("DownLoad Count is: " + downloadNum);
			if (downloadNum != count) {
				System.err.println("Downloaded " + downloadNum
						+ " record(s) but the session reported " + count);
			}

			// Push buffered lines to disk before stopping the clock.
			out.flush();
			long endTime = System.currentTimeMillis();
			System.out.println("总共耗时:" + (endTime - startTime) + " ms");
			System.out.println("-------------------------------------------------");

		} catch (TunnelException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} catch (InterruptedException e) {
			// Restore the interrupt flag for whoever runs this thread.
			Thread.currentThread().interrupt();
			e.printStackTrace();
		} catch (ExecutionException e) {
			e.printStackTrace();
		} finally {
			if (pool != null) {
				pool.shutdown();
			}
			if (out != null) {
				try {
					out.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}

	}

}


使用PyODPS连接池进行多线程查询ODPS需要注意以下事项:

1. 连接池是线程安全的,但是同一时刻只能有一个连接使用。因此,需要在多线程中正确地使用连接池。
2. 如果要查询不同的表,建议每个线程使用一个连接池,以避免出现连接池竞争的情况。
3. 如果要查询同一个表,可以让多个线程共用一个连接池,但需要注意每个查询的结果集不要互相干扰。

下面是一个简单的多线程查询ODPS的示例代码:

```python
from concurrent.futures import ThreadPoolExecutor
from odps import ODPS
from odps.tunnel import TableTunnel
from odps.tunnel.pool import TableTunnelPool

# 创建ODPS连接
odps = ODPS('your_access_id', 'your_access_key', 'your_project_name', 'your_endpoint')

def query_table(table_name):
    # 创建TableTunnel连接池
    pool = TableTunnelPool(odps)
    # 从连接池中获取TableTunnel连接
    tunnel = pool.get_tunnel()
    # 通过TableTunnel连接进行数据传输
    with tunnel.execute_sql(f'SELECT * FROM {table_name} LIMIT 10').open_reader() as reader:
        for record in reader:
            print(f'Table {table_name}, record: {record}')

# 创建线程池
executor = ThreadPoolExecutor(max_workers=5)

# 提交任务到线程池
table_names = ['table1', 'table2', 'table3', 'table4', 'table5']
for table_name in table_names:
    executor.submit(query_table, table_name)
```

以上代码中,首先创建了ODPS连接,然后创建了一个线程池,并提交了5个任务到线程池。每个任务都会使用一个独立的连接池,并查询指定的表。在查询过程中,每个线程都会从连接池中获取一个连接,并使用该连接进行数据查询。查询完成后,连接会自动回收到连接池中,不需要手动关闭。

需要注意的是,以上示例中使用了线程池,线程池的最大并发数由max_workers参数指定;未指定时,默认值为 min(32, CPU核心数 + 4)(Python 3.8 及以上版本)。如果需要修改线程池的配置,可以通过传递参数进行配置,例如:

```python
executor = ThreadPoolExecutor(max_workers=10)
```

以上代码中,将线程池的最大并发数设置为10。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值