出现的现象:
102采集程序每半小时采集并发送一次数据;第二次发送时,下游没有收到数据。监控日志如下:
可观察到[Thread2]线程被阻塞了
@Slf4j
public class TcpClient implements OneWayGwClient {
// 默认连接到端口8189
private String host;
// 默认连接到端口8189
private int port;
private Socket socket;
private BufferedWriter bufferedWriter;
private volatile boolean running = false;
public TcpClient(String ip, int port) {
this.host = ip;
this.port = port;
}
private boolean connect() {
log.info("Connect to ip and port {}-{}.", host, port);
if (this.isConnected()) {
log.warn("Socket already connected, ignored.");
return true;
}
try {
// 连接到服务器
this.socket = new Socket(host, port);
log.info("Connect socket successfully.");
this.bufferedWriter = new BufferedWriter(
new OutputStreamWriter(this.socket.getOutputStream())
);
return true;
} catch (IOException e) {
//捕获异常
log.error("Connect socket failed.",e);
return false;
}
}
public void disconnect() {
log.info("Disconnect to ip and port {}-{}.", host, port);
try {
if (!Objects.isNull(this.bufferedWriter)) {
this.bufferedWriter.close();
}
if (!Objects.isNull(this.socket) && !this.socket.isClosed()) {
log.info("Socket to close.");
socket.close();
}
//关闭Socket监听
} catch (IOException e) {
log.error("Close socket failed.",e);
}
}
@Override
public boolean isIdle() {
return false;
}
@Override
public void start() throws IOException {
log.info("Start to run.");
if (isRunning()) {
log.warn("Already running, ignored.");
return;
}
this.running = true;
connect();
}
@Override
public boolean isRunning() {
return this.running;
}
private boolean isConnected() {
if (Objects.isNull(this.bufferedWriter)) {
return false;
}
if (!Objects.isNull(this.socket) && this.socket.isConnected() && !this.socket.isClosed()) {
return true;
}
return false;
}
@Override
public OneWayGwResponse send(OneWayGwRequest oneWayGwRequest) {
if (!isRunning()) {
log.error("Not running.");
return OneWayGwResponse.ofFailed();
}
if (!isConnected()) {
log.error("Not connected.");
throw new OneWayGwException("Not connected.");
}
byte[] payload = oneWayGwRequest.getPayload();
try {
this.bufferedWriter.write(Arrays.toString(payload) + "\n");
this.bufferedWriter.flush();
} catch (IOException e) {
throw new OneWayGwException(e);
}
return OneWayGwResponse.ofSuccess();
}
@Override
public void close() throws IOException {
log.info("Stop to run.");
if (!isRunning()) {
log.warn("Already closed, ignored.");
return;
}
this.running = false;
disconnect();
}
}
通过代码走查发现,发送数据的方法(send)在写 Socket 时可能发生阻塞。以前程序没有出现过这个问题,估计是该现场网络不稳定所致:程序没有处理好原生 Socket 连接异常的情况,长连接也没有保活机制。
处理方式
基于 Socket 的 keepalive 选项为长连接增加保活(心跳)机制
// Create an unconnected socket so the options are in place before connecting
Socket socket = new Socket();
// Enable SO_KEEPALIVE; on its own, probe timing is taken from the OS settings
socket.setKeepAlive(true);
// Per-socket keepalive timing in seconds: idle time before the first probe and
// interval between probes (JDK 11+, only on platforms that support these options)
socket.setOption(jdk.net.ExtendedSocketOptions.TCP_KEEPIDLE, 60);
socket.setOption(jdk.net.ExtendedSocketOptions.TCP_KEEPINTERVAL, 30);
然后通过 tcpdump 抓包并查看日志,发现预期的 keepalive 探测报文没有出现。
原因是 SO_KEEPALIVE 的探测时机默认由内核参数决定(Linux 默认 tcp_keepalive_time 为 7200 秒,远大于半小时的发送间隔)。
因此还需要修改操作系统的参数:
# cat /etc/sysctl.conf
net.ipv4.tcp_keepalive_time = 60
net.ipv4.tcp_keepalive_probes = 2
net.ipv4.tcp_keepalive_intvl = 30
# 使配置生效
sysctl -p
修改后持续观察,系统没有再出现断连丢数据的情况。
长远来看,最好还是使用 Netty 作为通信的底层框架。