nodejs实践录:使用pm2实现服务的“平滑更新”

本文介绍如何使用pm2实现业务程序的“平滑更新”。

背景

正常运行的程序难免会遇到升级的问题。如何让客户对升级无感,是一个大问题。本文不涉及CI/CD方面的AB测试、灰度升级等,只是利用pm2的特性来实现“平滑更新”的目的,文末给出结论。

代码

本文的测试程序分服务端和客户端,测试时,先让服务端运行,然后让客户端一直连接服务器,接着使用不同的pm2配置更新服务器,观察客户端的连接状态。

服务端代码(注:下面列出的代码与后文“客户端代码”完全相同,实为客户端测试程序,服务端代码在原文中缺失):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
/*
用于测试http服务器连接性,每0.5秒发送GET请求,如连接不上,返回错误,打印err
*/

var request = require('request');

const log = require('../lib/log.js');

/**
 * Send one HTTP GET to `url` through the `request` module and log the outcome.
 *
 * The call uses `json: true`, a fixed set of headers, and passes `requestData`
 * as the `body` option. A response without error and with status 200 is
 * printed via `log.print`; every other outcome is reported as `err`.
 *
 * @param {string} url - Address to request.
 * @param {*} requestData - Value handed to `request` as the body.
 * @returns {void} Nothing; results are only reported through `log`.
 */
function httpGet(url, requestData) {
  const fixedHeaders = {
    'content-type': 'application/json',
    'User-Agent': 'git-oschina-hook',
    'X-Gitee-Token': 'webhook password',
    'X-Gitee-Event': 'Push Hook',
  };
  const options = {
    url,
    method: 'GET',
    json: true,
    headers: fixedHeaders,
    body: requestData,
  };

  request(options, (error, response, body) => {
    // Loose comparison is kept on purpose so the accepted status values stay the same.
    const succeeded = !error && response.statusCode == 200;
    if (succeeded) {
      log.print('=============GET content: \n', body);
      return;
    }
    log.err('err');
  });
}

/**
 * Build the fixed test payload and send it to the local test server
 * at http://127.0.0.1:4000/ via `httpGet`.
 *
 * @returns {void}
 */
function getHttp() {
  const url = 'http://127.0.0.1:4000/';
  const msg = {
    devid: '123456798',
    station: '01',
    // Fix: this used to be `msg['stateCode']=='501'`, a comparison whose
    // result was discarded, so the field never reached the payload.
    stateCode: '501',
  };
  httpGet(url, msg);
}

// Handle of the running poll timer, or null while the loop is stopped.
var g_loopId = null;

/**
 * Start calling `loopFunc` every 500 ms.
 *
 * Does nothing when the loop is already running: overwriting the handle
 * would leave the earlier interval running with no way to clear it.
 *
 * @returns {void}
 */
function startLoop() {
  if (g_loopId !== null) {
    return;
  }
  g_loopId = setInterval(loopFunc, 500);
}

/**
 * Stop the polling loop started by `startLoop`.
 *
 * The handle is reset to null so the loop can be started again later and
 * a repeated stop is a no-op.
 *
 * @returns {void}
 */
function stopLoop() {
  if (g_loopId === null) {
    return;
  }
  clearInterval(g_loopId);
  g_loopId = null;
}

/**
 * Timer callback: performs one probe by calling `getHttp`.
 *
 * @returns {void}
 */
function loopFunc() {
  getHttp();
}

/**
 * Entry point: log a start-up message, then begin the polling loop.
 *
 * @returns {void}
 */
function main() {
  log.print('start looping...');
  startLoop();
}

// Run immediately when the script is loaded.
main();

客户端代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
/*
用于测试http服务器连接性,每0.5秒发送GET请求,如连接不上,返回错误,打印err
*/

var request = require('request');

const log = require('../lib/log.js');

/**
 * Send one HTTP GET to `url` through the `request` module and log the outcome.
 *
 * The call uses `json: true`, a fixed set of headers, and passes `requestData`
 * as the `body` option. A response without error and with status 200 is
 * printed via `log.print`; every other outcome is reported as `err`.
 *
 * @param {string} url - Address to request.
 * @param {*} requestData - Value handed to `request` as the body.
 * @returns {void} Nothing; results are only reported through `log`.
 */
function httpGet(url, requestData) {
  const fixedHeaders = {
    'content-type': 'application/json',
    'User-Agent': 'git-oschina-hook',
    'X-Gitee-Token': 'webhook password',
    'X-Gitee-Event': 'Push Hook',
  };
  const options = {
    url,
    method: 'GET',
    json: true,
    headers: fixedHeaders,
    body: requestData,
  };

  request(options, (error, response, body) => {
    // Loose comparison is kept on purpose so the accepted status values stay the same.
    const succeeded = !error && response.statusCode == 200;
    if (succeeded) {
      log.print('=============GET content: \n', body);
      return;
    }
    log.err('err');
  });
}

/**
 * Build the fixed test payload and send it to the local test server
 * at http://127.0.0.1:4000/ via `httpGet`.
 *
 * @returns {void}
 */
function getHttp() {
  const url = 'http://127.0.0.1:4000/';
  const msg = {
    devid: '123456798',
    station: '01',
    // Fix: this used to be `msg['stateCode']=='501'`, a comparison whose
    // result was discarded, so the field never reached the payload.
    stateCode: '501',
  };
  httpGet(url, msg);
}

// Handle of the running poll timer, or null while the loop is stopped.
var g_loopId = null;

/**
 * Start calling `loopFunc` every 500 ms.
 *
 * Does nothing when the loop is already running: overwriting the handle
 * would leave the earlier interval running with no way to clear it.
 *
 * @returns {void}
 */
function startLoop() {
  if (g_loopId !== null) {
    return;
  }
  g_loopId = setInterval(loopFunc, 500);
}

/**
 * Stop the polling loop started by `startLoop`.
 *
 * The handle is reset to null so the loop can be started again later and
 * a repeated stop is a no-op.
 *
 * @returns {void}
 */
function stopLoop() {
  if (g_loopId === null) {
    return;
  }
  clearInterval(g_loopId);
  g_loopId = null;
}

/**
 * Timer callback: performs one probe by calling `getHttp`.
 *
 * @returns {void}
 */
function loopFunc() {
  getHttp();
}

/**
 * Entry point: log a start-up message, then begin the polling loop.
 *
 * @returns {void}
 */
function main() {
  log.print('start looping...');
  startLoop();
}

// Run immediately when the script is loaded.
main();

ecosystem.config.js配置文件:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
module.exports = {
// APP是一个数组,可以有多个
// 参数参考:https://pm2.io/doc/en/runtime/reference/ecosystem-file/
apps : [
{
name: 'app',
script: 'server.js',
args: 'null',
instances: 2,
exec_mode: "cluster", // 集群模式,如不指定,默认为fork
autorestart: false,
min_uptime: "60s",
max_restarts: 3,
watch: false,
//error_file: "./logs/app-err.log",
//out_file: "./logs/app-out.log",
log: "./logs/app.log",
//log_date_format: "YYYY-MM-DD HH:mm Z", // pm2 log添加日期
max_memory_restart: '1G',
//listen_timeout: 3000,
kill_timeout: 3000,
// wait_ready: true,
env:
{
NODE_ENV: 'development'
},
env_production:
{
NODE_ENV: 'production'
}
}
],

// 部署,暂未使用,不用理
deploy : {
production : {
user : 'node',
host : '212.83.163.1',
ref : 'origin/master',
repo : 'git@github.com:repo.git',
path : '/var/www/production',
'post-deploy' : 'npm install && pm2 reload ecosystem.config.js --env production'
}
}
};

测试

使用pm2 start启动服务。此时使用客户端连接,正常连接。此处不表。

使用pm2 reload all重新启动。此时,输出如下日志:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
// 1、先启动2个进程,并提示在线
PM2 | App [app:0] starting in -cluster mode-
PM2 | App [app:1] starting in -cluster mode-
PM2 | App [app:1] online
PM2 | App [app:0] online
1|app | [2019-01-06 15:46:44.628] httpserver is listening on 4000
0|app | [2019-01-06 15:46:44.633] httpserver is listening on 4000
// 2、再停止旧进程
PM2 | -reload- New worker listening
PM2 | Stopping app:app id:_old_1
PM2 | -reload- New worker listening
PM2 | Stopping app:app id:_old_0
// 3、旧进程退出
PM2 | App name:app id:_old_1 disconnected
PM2 | App [app:_old_1] exited with code [1] via signal [SIGINT]
PM2 | App name:app id:_old_0 disconnected
PM2 | App [app:_old_0] exited with code [1] via signal [SIGINT]
PM2 | pid=16952 msg=process killed
PM2 | pid=4504 msg=process killed

// 4、新的进程开始提供服务

下面看看pm2 restart all的结果,输出日志如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
PM2         | App name:app id:0 disconnected
PM2 | App [app:0] exited with code [1] via signal [SIGINT]
PM2 | App name:app id:1 disconnected
PM2 | App [app:1] exited with code [1] via signal [SIGINT]
PM2 | pid=15528 msg=process killed
PM2 | App [app:0] starting in -cluster mode-
PM2 | pid=11808 msg=process killed
PM2 | App [app:1] starting in -cluster mode-
PM2 | App [app:1] online
PM2 | (node:10372) [DEP0007] DeprecationWarning: worker.suicide is deprecated. Please use worker.exitedAfterDisconnect.
PM2 | App [app:0] online
PM2 | (node:10372) [DEP0007] DeprecationWarning: worker.suicide is deprecated. Please use worker.exitedAfterDisconnect.
1|app | [2019-01-06 16:04:55.163] httpserver is listening on 4000
0|app | [2019-01-06 16:04:55.192] httpserver is listening on 4000

由于pm2日志没有打印时间(可以配置,但本文没有做),所以无法准确知道重启所花时间,但根据笔者目测,是在3~5秒之间,这个时间值仅限于本例,通过其它实践知道pm2重启速度还是比较快的。

看看这个时候客户端的信息:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
[2019-01-06 16:04:50.151]  =============GET content:
here is my voice: Hello World! now is:
Sun Jan 06 2019 16:04:50 GMT+0800 (中国标准时间) cnt: 52

[2019-01-06 16:04:51.650] err
[2019-01-06 16:04:52.154] err
[2019-01-06 16:04:52.654] err
[2019-01-06 16:04:53.160] err
[2019-01-06 16:04:53.657] err
[2019-01-06 16:04:54.159] err
[2019-01-06 16:04:54.857] err
[2019-01-06 16:04:55.241] =============GET content:
here is my voice: Hello World! now is:
Sun Jan 06 2019 16:04:55 GMT+0800 (中国标准时间) cnt: 1

[2019-01-06 16:04:55.677] =============GET content:
here is my voice: Hello World! now is:
Sun Jan 06 2019 16:04:55 GMT+0800 (中国标准时间) cnt: 2

可以看到,有5秒左右的时间一直在打印“err”,即连接不到服务器。

小结

对于socket而言,使用短连接的,可以实现“平滑”,因为每次都使用新的socket去连接,比如http请求。但是,如果是长连接,始终存在socket切换的时刻,因为毕竟进程是重新启动了的。这时,socket客户端会进行重新连接了。所以有一定局限性(但也是没办法的,进程重启,是不能保持socket连接的)。
笔者接触过的socket客户端,都有重连的机制。在实际工作中,不妨考虑本文的方案。

李迟 2019.2.5 周二