23、实现“谁在线”服务的技术方案与优化-CSDN博客

实现“谁在线”服务的技术方案与优化

在构建网站的“谁在线”服务时，我们最初尝试使用 MySQL 来实现，但遇到了性能瓶颈。下面将详细介绍整个过程以及最终的优化方案。

初始方案：使用 MySQL

我们最初的实现是通过 RSHUTDOWN 函数执行 REPLACE INTO SQL 语句将用户信息记录到 MySQL 的 recent_hits 表中。具体步骤如下：
1. PHP 扩展配置 ：
- 将相关文件放在一个目录中，运行 phpize && ./configure && make && make install 进行编译和安装。
- 在 php.ini 中添加 extension=RealtimeExtensionToAppendRequestData.so 。
2. 用户信息插入 ：在网站的认证代码中，使用 <?php newssite_poke_user($SiteUserID); ?> 将用户的 SiteUserID 插入到 Apache 的数据结构中。

然而，这个方案在性能上存在严重问题。我们的目标是每秒处理 500 个新用户到达和 2000 个后续页面视图，高峰时段每秒处理 9000 个动态页面加载。但实际测试中，每秒只能实现约 800 个操作，远低于预期。这是因为：
- 查询缓存失效 ：每次插入或更新操作都会使 MySQL 的查询缓存失效，导致全表扫描和索引范围扫描，尤其是对于热门 URL，性能更差。
- ACID 特性不适用 ：对于“谁在线”这个特定功能，ACID 特性并不是必需的，例如数据的持久性、隔离性等在 30 分钟的时间窗口内并不重要。
- 日志记录同步问题 ：页面视图的日志记录与页面服务同步进行，一旦 REPLACE INTO 语句变慢，整个网站都会受到影响。

优化方案：使用 spreadlogd

为了解决上述问题，我们选择了 spreadlogd 作为新的平台，通过被动日志分析来实现“谁在线”服务。

选择平台

spreadlogd 可以被动收集所有日志，并且支持可加载模块，适合处理每秒 9000 个请求的需求。我们使用 C 语言来编写模块，以便更好地控制数据结构。

定义数据结构

我们创建了 online.h 头文件，定义了一些默认值和简单的数据结构：

#ifndef _ONLINE_H_
#define _ONLINE_H_

#include "sld_config.h"
#include "module.h"
#include "echash.h"
#include "nethelp.h"
#include "skip_heap.h"

/* Tunables for the "who's online" module.
 * MAX_AGE is parenthesized so it expands safely inside larger expressions
 * (e.g. x / MAX_AGE). */
#define MAX_AGE (30*60)       /* seconds before a hit is considered expired */
#define MAX_USER_INFO 30      /* most per-user records returned per reply   */
#define DEFAULT_PORT 8989     /* port used when the config names none       */
#define DEFAULT_ADDRESS "*"   /* address used when the config names none    */

/* One record per distinct URL currently being tracked.  The record owns the
 * URL string; hit_t entries only point at it. */
typedef struct urlcount {
  char *URL;               /* The URL, so we only store one copy  */
  unsigned int cnt;        /* count of users who last viewed this */
} urlcount_t;

/* A user's most recent page view.  URL is borrowed from the matching
 * urlcount_t (it is not a private copy), so a hit must never free it.
 * NOTE(review): online_logline passes a bare &SiteUserID as the search key;
 * presumably the skiplist routes that through the key comparator -- confirm
 * before moving SiteUserID away from the first position. */
typedef struct hit {
  unsigned long long SiteUserID;  /* The viewer's ID */
  char *URL;                      /* The URL, refs urls */
  time_t Hitdate;                 /* the time of the hit */
} hit_t;

/* Event callback that answers one client query on a connection. */
void online_service(int fd, short event, void *vnl);
/* Look up the counter record for the len-byte URL; NULL if not tracked. */
urlcount_t *get_url(const char *url, int len);
/* Drop every hit older than MAX_AGE seconds. */
void cull_old_hits();
/* Position on the newest hit recorded for url in the URL/Hitdate index. */
struct skiplistnode *get_first_hit_for_url(const char *url);
/* Number of users currently tracked as online. */
unsigned int get_current_online_count();

#endif

服务提供者

使用简单的二进制格式进行客户端 - 服务器通信。客户端发送请求，服务器返回总在线用户数、指定 URL 的在线用户数以及详细用户记录。以下是 online-server.c 的代码：

#include "online.h"
/* One per-user record as it is written to the client.  The Perl client
 * reads 16 bytes per record: 8 of SiteUserID, 4 of age, 4 of padding. */
typedef struct user_hit_info {
  unsigned long long SiteUserID;  /* This viewer's ID */
  int age;                        /* Seconds since view */
  int pad;                        /* 4-byte alignment pad */
} user_hit_info_t;

/*
 * Collect the answer for one query.
 *
 *   url        NUL-terminated URL being asked about.
 *   total      out: number of users currently online site-wide.
 *   url_total  out: number of users whose last hit was on url.
 *   uinfo      out: array receiving up to *nusers per-user records,
 *              starting from the newest hit on url.
 *   nusers     in: capacity of uinfo; out: number of records filled in.
 *
 * Expired hits are culled first so every count reflects the live window.
 */
static void get_hit_info(const char *url, unsigned int *total,
                         unsigned int *url_total,
                         user_hit_info_t *uinfo,
                         unsigned int *nusers) {
  struct skiplistnode *first;
  urlcount_t *uc;
  time_t now;
  unsigned int max_users = *nusers;

  /* Clean up any hits that are too old */
  cull_old_hits();
  *total = get_current_online_count();
  if((uc = get_url(url, strlen(url))) == NULL) {
    *nusers = *url_total = 0;
    return;
  }
  now = time(NULL);
  *url_total = uc->cnt;
  first = get_first_hit_for_url(uc->URL);
  *nusers = 0;
  while(*nusers < max_users && first) {
    hit_t *hit = (hit_t *)first->data;
    /* keep going until we see a new URL */
    if(!hit || strcmp(hit->URL,uc->URL)) break;
    uinfo[*nusers].SiteUserID = hit->SiteUserID;
    uinfo[*nusers].age = now - hit->Hitdate;
    /* The whole struct goes onto the wire, so never leave the pad holding
     * whatever happened to be in the caller's buffer. */
    uinfo[*nusers].pad = 0;
    (*nusers)++;
    sl_next(&first);
  }
  return;
}

/*
 * Event callback for a client connection.
 *
 * Request:  a 16-bit network-order URL length followed by that many bytes
 *           of URL.
 * Response: total, url_total and nusers as 32-bit network-order integers,
 *           followed by nusers user_hit_info_t records.
 *
 * Partial request state is parked in nl->userdata between invocations so a
 * URL that arrives in several pieces can be reassembled.  On any error the
 * state is freed, the descriptor is closed and the event is removed.
 */
void online_service(int fd, short event, void *vnl) {
  netlisten_t *nl = (netlisten_t *)vnl;
  int expected_write, actual_write;
  unsigned int i;
  struct iovec io[4];
  unsigned int total, url_total, nusers = MAX_USER_INFO;
  user_hit_info_t uinfo[MAX_USER_INFO];
  struct {
    unsigned short sofar;
    unsigned short ulen;
    char *url;
  } *req;

  if(NULL == (req = nl->userdata)) {
    nl->userdata = req = calloc(1, sizeof(*req));
    if(!req) goto bail;
  }

  /* No buffer yet means the length prefix has not been consumed. */
  if(!req->url) {
    /* Read the length of the URL to be passed (network short) */
    if(read(nl->fd, &req->ulen, sizeof(req->ulen)) !=
       sizeof(req->ulen)) goto bail;
    req->ulen = ntohs(req->ulen);
    /* Allocate our read length plus a terminator */
    req->url = malloc(req->ulen + 1);
    if(!req->url) goto bail;
    req->url[req->ulen] = '\0';
  }

  while(req->sofar < req->ulen) {
    /* Append after what has already arrived; reading to the start of the
     * buffer would overwrite earlier fragments of the URL. */
    int len = read(nl->fd, req->url + req->sofar, req->ulen - req->sofar);
    if(len == -1 && errno == EAGAIN) return; /* incomplete read */
    if(len <= 0) goto bail;                  /* error */
    req->sofar += len;
  }

  /* Records are written to the socket verbatim, padding included, so start
   * from a zeroed array rather than raw stack contents. */
  memset(uinfo, 0, sizeof(uinfo));

  /* Get the answer */
  get_hit_info(req->url, &total, &url_total, uinfo, &nusers);

  /* Pack it on the network */
  expected_write = sizeof(total) * 3 + nusers * sizeof(*uinfo);
  io[0].iov_base = &total;     io[0].iov_len = sizeof(total);
  io[1].iov_base = &url_total; io[1].iov_len = sizeof(url_total);
  io[2].iov_base = &nusers;    io[2].iov_len = sizeof(nusers);
  io[3].iov_base = uinfo;
  io[3].iov_len = nusers * sizeof(*uinfo);

  total = htonl(total);
  url_total = htonl(url_total);
  /* NOTE(review): bswap64 is applied unconditionally; confirm it is a
   * host-to-network conversion and not a raw swap on big-endian hosts. */
  for(i=0;i<nusers;i++) {
    uinfo[i].SiteUserID = bswap64(uinfo[i].SiteUserID);
    uinfo[i].age = htonl(uinfo[i].age);
  }
  /* Converted last: the loop above needs the host-order count. */
  nusers = htonl(nusers);

  /* We should be able to write it all at once. We don't support */
  /* command pipelining, so the total contents of the outbound   */
  /* buffer will only ever be this large.                        */
  actual_write = writev(nl->fd, io, 4);
  if(actual_write != expected_write) goto bail;

  /* Reset the parked state so the next request starts from scratch. */
  free(req->url);
  memset(req, 0, sizeof(*req));
  return;

bail:
  if(req) {
    free(req->url);
    free(req);
    /* Drop the stale pointer so nothing can reuse or re-free it. */
    nl->userdata = NULL;
  }
  close(nl->fd);
  event_del(&nl->e);
  return;
}

信息收集器

通过 online.c 文件实现信息收集，将 URL 存储在哈希表中，用户的最后一次访问信息存储在多索引跳表中。代码如下：

#include "online.h"

static Skiplist hits;      /* Tracks each user's last hit */
static ec_hash_table urls; /* Tracks the count on each URL */

void urlcount_free(void *vuc) {
  if(((urlcount_t *)vuc)->URL) free(((urlcount_t *)vuc)->URL);
  free(vuc);
}

urlcount_t *get_url(const char *url, int len) {
  void *uc;
  if(echash_retrieve(&urls, url, len, &uc)) return uc;
  return NULL;
}

/* Drop one reference from url's counter.  When the count reaches zero the
 * entry is removed from the hash and destroyed through urlcount_free.
 * Unknown URLs are ignored. */
static void urlcount_decrement(const char *url) {
  urlcount_t *entry = get_url(url, strlen(url));
  if(entry == NULL) return;
  entry->cnt -= 1;
  if(entry->cnt != 0) return;
  echash_delete(&urls, url, strlen(url), NULL, urlcount_free);
}

void hit_free(void *vhit) {
  urlcount_decrement(((hit_t *)vhit)->URL);
  free(vhit);
}

/* comparator for the URL,Hitdate index */
static int url_hitdate_comp(const void *a, const void *b) {
  int ret = strcmp(((hit_t *)a)->URL, ((hit_t *)b)->URL);
  if(ret) return ret;
  /* Newest (greatest) in front */
  return (((hit_t *)a)->Hitdate < ((hit_t *)b)->Hitdate)?1:-1;
}

/* comparator for the Hitdate */
static int hitdate_comp(const void *a, const void *b) {
  /* Oldest in front... so we can pop off expired ones */
  return (((hit_t *)a)->Hitdate < ((hit_t *)b)->Hitdate)?-1:1;
}

/* comparator for the SiteUserID */
static int SiteUserID_comp(const void *a, const void *b) {
  if(((hit_t *)a)->SiteUserID == ((hit_t *)b)->SiteUserID) return 0;
  if(((hit_t *)a)->SiteUserID < ((hit_t *)b)->SiteUserID) return -1;
  return 1;
}

static int SiteUserID_comp_key(const void *a, const void *b) {
  if(*((unsigned long long *)a) == ((hit_t *)b)->SiteUserID) return 0;
  if(*((unsigned long long *)a) < ((hit_t *)b)->SiteUserID) return -1;
  return 1;
}

/* The number of users online is the number of entries in the hits list. */
unsigned int get_current_online_count() {
  unsigned int online_now = hits.size;
  return online_now;
}

void cull_old_hits() {
  hit_t *hit;
  time_t oldest;
  oldest = time(NULL) - MAX_AGE;
  while((hit = sl_peek(&hits)) != NULL && (hit->Hitdate < oldest))
    sl_pop(&hits, hit_free);
}

/* Locate where url's hits begin in the (URL, Hitdate) index and return the
 * right-hand neighbour of a probe that is deliberately newer than any
 * stored hit. */
struct skiplistnode *get_first_hit_for_url(const char *url) {
  struct skiplistnode *exact, *before, *after;
  hit_t probe;
  /* Probe one second into the future: the lookup will miss, and `after`
   * is then expected to be the newest node for this URL. */
  probe.Hitdate = time(NULL) + 1;
  probe.URL = (char *)url;
  sl_find_compare_neighbors(&hits, &probe, &exact, &before, &after,
                            url_hitdate_comp);
  return after;
}

/*
 * Module start-up hook.
 *
 * config is the optional "host[:port]" argument given to the module.  A
 * missing host falls back to DEFAULT_ADDRESS and a missing port to
 * DEFAULT_PORT.  Returns 0 on success, or -1 when the port is not a number
 * in 0..65535 or the listener could not be started.
 */
static int online_init(const char *config) {
  char *host = NULL, *sport = NULL;
  unsigned int port = DEFAULT_PORT;
  echash_init(&urls);
  sl_init(&hits);
  /* Primary ordering is by Hitdate; two more indexes allow lookup by user
   * and by (URL, Hitdate). */
  sl_set_compare(&hits, hitdate_comp, hitdate_comp);
  sl_add_index(&hits, SiteUserID_comp, SiteUserID_comp_key);
  sl_add_index(&hits, url_hitdate_comp, url_hitdate_comp);

  if(config) host = strdup(config);
  if(host) sport = strchr(host, ':');
  if(sport) {
    char *end;
    unsigned long parsed;
    *sport++ = '\0';
    /* strtoul lets us tell "no digits" and out-of-range values apart from
     * a real port, which atoi silently turned into 0 or garbage. */
    parsed = strtoul(sport, &end, 10);
    if(end == sport || parsed > 65535) {
      fprintf(stderr, "Invalid port in online module config: %s\n", config);
      free(host);
      return -1;
    }
    port = (unsigned int)parsed;
  }
  if(!host) host = DEFAULT_ADDRESS;
  /* NOTE(review): a strdup'd host is never freed; whether tcp_dispatch
   * keeps the pointer is not visible here, so it is left alone. */
  if(tcp_dispatch(host, port, 100, EV_READ|EV_PERSIST, online_service,
                  NULL) < 0) {
    /* config may be NULL; never hand NULL to %s. */
    fprintf(stderr, "Could not start service on %s\n",
            config ? config : DEFAULT_ADDRESS);
    return -1;
  }
  return 0;
}

/* Module shutdown hook: only announces the stop on stderr. */
static void online_shutdown() {
  fputs("Stopping online module.\n", stderr);
}

/* Point `a` at field number b of the log line and store that field's length
 * in `a_len`.  Relies on a `tokens` array in the calling scope in which
 * tokens[b+1] starts one character past the space that ends field b; both
 * `a` and `a_len` must already be declared by the caller. */
#define SET_FROM_TOKEN(a,b) do { \
  a ## _len = tokens[(b)+1]-tokens[(b)]-1; \
  a = tokens[(b)]; \
} while(0)

/*
 * Per-log-line hook: record that a user has just viewed a URL.
 *
 * message is a space-separated log line in which field 3 is the numeric
 * user id and field 7 is the URL.  Lines containing fewer than seven spaces
 * are ignored.  The user's previous hit (if any) is replaced, the per-URL
 * counters are adjusted, and expired hits are culled afterwards.  If memory
 * cannot be allocated the line is dropped without changing any counts.
 */
static void online_logline(SpreadConfiguration *sc,
       const char *sender, const char *group, const char *message) {
  const char *tokens[8];
  const char *user, *url;
  unsigned long long SiteUserID;
  int user_len, url_len;
  urlcount_t *uc;
  hit_t *hit;
  int i;

  tokens[0] = message;
  for(i=1; i<8; i++) {
    tokens[i] = strchr(tokens[i-1], ' ');
    if(!tokens[i]++) return;  /* couldn't find token */
  }
  /* the userid is field 3 and the URI is field 7 based on white space */
  SET_FROM_TOKEN(user, 2);
  SET_FROM_TOKEN(url, 6);
  (void)user_len;  /* set by the macro; parsing stops at the space anyway */

  /* SiteUserID is 64 bits wide; strtoul would truncate it wherever
   * unsigned long is only 32 bits. */
  SiteUserID = strtoull(user, NULL, 10);
  /* Find the URL in the URL counts, creating if necessary */
  if((uc = get_url(url, url_len)) == NULL) {
    uc = calloc(1, sizeof(*uc));
    if(!uc) return;
    uc->URL = malloc(url_len+1);
    if(!uc->URL) {
      free(uc);
      return;
    }
    memcpy(uc->URL, url, url_len);
    uc->URL[url_len] = '\0';
    echash_store(&urls, uc->URL, url_len, uc);
  }
  /* Increment the counter on the URL */
  uc->cnt++;

  /* Fetch this user's last hit */
  hit = sl_find_compare(&hits, &SiteUserID, NULL, SiteUserID_comp);
  if(!hit) {
    /* No hit for this user, allocate one */
    hit = calloc(1, sizeof(*hit));
    if(!hit) {
      /* Give back the reference taken above so the count stays right. */
      urlcount_decrement(uc->URL);
      return;
    }
  }
  else {
    /* We have an old hit.  We must reduce the count on the old URL.
     * it is not our string, so we don't free it. */
    sl_remove_compare(&hits, &SiteUserID, NULL, SiteUserID_comp);
    urlcount_decrement(hit->URL);
  }
  /* The hit borrows the URL string owned by the urlcount_t. */
  hit->URL = uc->URL;
  hit->SiteUserID = SiteUserID;
  hit->Hitdate = time(NULL);
  sl_insert(&hits, hit);
  cull_old_hits();
}

/* Module descriptor: the module name followed by the init, per-log-line and
 * shutdown hooks defined in this file.  Presumably this is the symbol the
 * spreadlogd loader looks up -- confirm against module.h. */
sld_module_abi_t online = {
  "online",
  online_init,
  online_logline,
  online_shutdown
};

模块加载

将编译好的模块安装到 /opt/spreadlogd/libexec/online.so （即下面 ModuleDir 指定的目录），并配置 spreadlogd.conf 文件：

BufferSize = 65536
ModuleDir = /opt/spreadlogd/libexec
LoadModule online *:8989
Spread {
  Port = 4803
  Log {
    Group newssite
    ModuleLog online
  }
}

编写客户端

由于网站使用 Perl 编写，我们编写了客户端模块来实现客户端 - 服务器协议。以下是 NewsSite/WhosOnline.pm 和 NewsSite/WhosOnline/INET.pm 的代码（文件名需与包名 NewsSite::WhosOnline::INET 的大小写一致）：

NewsSite/WhosOnline.pm ：

package NewsSite::WhosOnline;

# Client side of the "who's online" binary protocol.
#
# Request : 16-bit network-order URL length, then the URL bytes.
# Response: three 32-bit network-order integers (total, url_total, nusers)
#           followed by nusers 16-byte user records.

# The transport class lives under NewsSite::, not Net::.
use NewsSite::WhosOnline::INET;
use vars qw/$VERSION/;
use strict;
use bigint;

$VERSION = "1.0";

# Constructor: hands every argument to the INET transport class.
sub new {
  my $class = shift;
  return NewsSite::WhosOnline::INET->new(@_);
}

# Ask about one URL.  Returns the hashref built by read_response, or undef
# if the reply could not be read.  Dies if the request cannot be written.
sub query {
  my($self, $url) = @_;
  $url = pack('n', length($url)) . $url;
  # Binary net-strings-style request
  my $wlen = $self->syswrite($url, length($url));
  die if(!defined($wlen) || ($wlen != length($url)));
  return $self->read_response;
}

# Read exactly $len bytes, looping because one sysread on a socket may
# return fewer bytes than requested.  Dies on error or early end-of-file.
sub _read_exact {
  my ($self, $len) = @_;
  my $buf = '';
  while(length($buf) < $len) {
    my $got = $self->sysread($buf, $len - length($buf), length($buf));
    die "short read from whosonline server\n" if(!$got);
  }
  return $buf;
}

# Read the fixed-size header; returns (total, url_total, nusers).
sub getcounts {
  my $self = shift;
  # 4 bytes x (total, url_total, nusers)
  my $pss = $self->_read_exact(12);
  return unpack('NNN', $pss);
}

# Read $count user records; returns an arrayref of [SiteUserID, age] pairs
# with SiteUserID rendered as a decimal string.
sub getuserinfo {
  my ($self, $count) = @_;
  my @hits;
  # The structure handed back to us is (for each user)
  # 8 bytes of SiteUserID, 4 bytes of age in seconds
  # and 4 bytes of padding.
  my $pss = $self->_read_exact(16 * $count);
  my @part = unpack('NNNN' x $count, $pss);
  while(@part) {
    # this little trick will allow bigint to kick in without
    # rolling an int on 32bit systems:
    # high * 0xffffffff + high + low == high * 2**32 + low
    my $rv = $part[0];
    $rv *= 0xffffffff;
    $rv += $part[0] + $part[1];
    push @hits, [ "$rv", $part[2] ];
    # We don't do anything with $part[3], it's padding.
    splice(@part, 0, 4); # eat these 4, and onto the next
  }
  return \@hits;
}

# Read one full reply.  Any failure while reading is trapped and reported
# as undef rather than propagated.
sub read_response {
  my $self = shift;
  my $response;
  eval {
    my ($total, $url_total, $nusers) = $self->getcounts;
    $response = { total       => $total,
                  url_total   => $url_total,
                  recent_hits => $self->getuserinfo($nusers) };
  };
  return $@?undef:$response;
}

1;

NewsSite/WhosOnline/INET.pm ：

package NewsSite::WhosOnline::INET;

# TCP transport for NewsSite::WhosOnline: a socket object that also carries
# the protocol methods through inheritance.

use NewsSite::WhosOnline;
use IO::Socket::INET;
use vars qw/$VERSION @ISA/;
use strict;

$VERSION = "1.0";
# Inherit the protocol methods from NewsSite::WhosOnline; the parent must be
# named exactly, otherwise query/read_response are never found.
@ISA = (qw/NewsSite::WhosOnline IO::Socket::INET/);

# Connect to $hostname.  Optional named arguments: Port (default 8989) and
# Timeout.  Returns the connected object, or undef if the connect fails.
sub new {
  my ($class, $hostname, %args) = @_;
  $args{Port} ||= 8989;  # set the default port
  # Call IO::Socket::INET's constructor explicitly: the first parent in @ISA
  # defines its own new(), which would otherwise be picked.
  return $class->IO::Socket::INET::new(PeerAddr => $hostname,
                                       PeerPort => $args{Port},
                                       Proto => 'tcp',
                                       Timeout => $args{Timeout});
}

1;

总结

通过使用 spreadlogd 进行被动日志分析，我们解决了 MySQL 方案中的性能问题，实现了一个高效的“谁在线”服务。整个过程包括选择平台、定义数据结构、实现服务提供者和信息收集器、加载模块以及编写客户端等步骤。

以下是整个流程的 mermaid 流程图：

graph LR
    A[初始方案：使用 MySQL] --> B[性能问题]
    B --> C[优化方案：使用 spreadlogd]
    C --> D[选择平台]
    D --> E[定义数据结构]
    E --> F[服务提供者]
    E --> G[信息收集器]
    F --> H[模块加载]
    G --> H
    H --> I[编写客户端]

通过这个优化方案，我们能够更好地满足网站的性能需求，为用户提供更流畅的体验。

实现“谁在线”服务的技术方案与优化

技术点分析

数据结构选择

在优化方案中，我们使用了哈希表和多索引跳表来存储数据。这种选择是基于对性能和功能的考虑：
- 哈希表（ ec_hash_table ） ：用于存储 URL 及其访问计数。哈希表的查找和插入操作平均时间复杂度为 O(1)，非常适合快速查找某个 URL 的访问计数。例如，在 get_url 函数中，通过 echash_retrieve 可以快速查找 URL 是否存在于哈希表中。
- 多索引跳表（ Skiplist ） ：用于存储用户的最后一次访问信息。跳表支持多个索引，如 SiteUserID 、 Hitdate 和 URL-Hitdate ，可以方便地进行范围查找和排序。例如，在 cull_old_hits 函数中，利用跳表按 Hitdate 排序的特性，快速移除过期的访问记录。

数据结构	优点	适用场景
哈希表	查找和插入速度快	快速查找 URL 及其访问计数
多索引跳表	支持范围查找和排序，插入和删除操作效率高	存储用户的最后一次访问信息，方便按不同条件查询

时间复杂度分析

以下是几个关键函数的时间复杂度分析：
- get_current_online_count ：返回在线用户总数，时间复杂度为 O(1)，因为只需要返回跳表的大小。
- cull_old_hits ：移除过期的访问记录，时间复杂度为 O(k)，其中 k 是过期记录的数量。由于跳表按 Hitdate 排序，过期记录位于跳表头部，移除操作效率高。
- get_first_hit_for_url ：查找指定 URL 的最新访问记录，时间复杂度为 O(log n)，其中 n 是跳表中记录的数量。通过 sl_find_compare_neighbors 函数在跳表中查找。

客户端 - 服务器通信

客户端和服务器之间使用简单的二进制格式进行通信，这种方式减少了数据传输量，提高了通信效率。具体步骤如下：
1. 客户端发送请求 ：客户端连接到服务器，发送一个 16 位网络字节序的无符号整数，表示要查询的 URL 的长度，随后发送 URL。
2. 服务器处理请求 ：服务器接收请求，调用 get_hit_info 函数获取相关信息，包括总在线用户数、指定 URL 的在线用户数以及详细用户记录。
3. 服务器返回响应 ：服务器将结果作为三个 32 位无符号整数（总在线用户数、指定 URL 的在线用户数、详细用户记录数量）和详细用户记录返回给客户端。

代码解读

`online.h` 头文件

该头文件定义了一些常量和数据结构，以及相关函数的声明。例如：
- MAX_AGE ：定义了用户访问记录的有效时间，即 30 分钟。
- urlcount_t ：用于存储 URL 及其访问计数。
- hit_t ：用于存储用户的最后一次访问信息，包括 SiteUserID 、 URL 和 Hitdate 。

`online-server.c` 文件

该文件实现了服务器端的功能，主要包括 get_hit_info 和 online_service 函数：
- get_hit_info ：根据 URL 获取相关信息，包括总在线用户数、指定 URL 的在线用户数以及详细用户记录。在获取信息之前，会调用 cull_old_hits 函数移除过期记录。
- online_service ：处理客户端请求，包括读取请求、调用 get_hit_info 函数获取信息、将信息打包成网络字节序并发送给客户端。

`online.c` 文件

该文件实现了信息收集器的功能，包括初始化、日志处理和关闭操作：
- online_init ：初始化哈希表和跳表，设置比较函数和索引，注册服务。
- online_logline ：处理每条日志记录，更新 URL 访问计数和用户的最后一次访问信息。
- online_shutdown ：关闭服务时输出提示信息。

实际应用与拓展

应用场景

该“谁在线”服务可以应用于各种网站，如新闻网站、社交网站等，为用户提供实时的在线用户信息，增强用户体验。例如，新闻网站可以在页面上显示当前在线用户数、某篇文章的阅读人数以及最近阅读该文章的用户列表。

拓展功能

可以在现有基础上进行拓展，实现更多功能：
- 实时统计分析 ：对用户的访问行为进行实时统计分析，如分析热门文章、用户访问时间分布等。
- 用户行为预测 ：根据用户的历史访问记录，预测用户的未来行为，为用户提供个性化的推荐。
- 安全监控 ：监控异常的用户访问行为，如频繁访问、异常 IP 地址等，提高网站的安全性。

总结与展望

通过使用 spreadlogd 进行被动日志分析，我们成功解决了 MySQL 方案中的性能问题，实现了一个高效的“谁在线”服务。整个方案包括选择合适的平台、定义数据结构、实现服务提供者和信息收集器、加载模块以及编写客户端等步骤。

在未来的开发中，我们可以进一步优化代码，提高系统的性能和稳定性。例如，使用更高效的数据结构和算法，优化客户端 - 服务器通信协议等。同时，可以根据实际需求拓展功能，为用户提供更丰富的服务。

以下是整个优化方案的总结流程图：

graph LR
    A[选择平台：spreadlogd] --> B[定义数据结构：哈希表和多索引跳表]
    B --> C[实现服务提供者：处理客户端请求]
    B --> D[实现信息收集器：处理日志记录]
    C --> E[模块加载：配置 spreadlogd.conf]
    D --> E
    E --> F[编写客户端：实现客户端 - 服务器通信]
    F --> G[应用与拓展：实时统计分析、用户行为预测等]

通过不断优化和拓展，我们可以为网站提供更强大的功能和更好的用户体验。