使用 TypeScript 实现基础的爬虫

安装项目需要的依赖

1
2
pnpm i superagent
pnpm i cheerio

起步

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
/*
* @Description:
* @Author: xiuji
* @Date: 2023-07-05 16:54:21
* @LastEditTime: 2023-08-01 14:09:15
* @LastEditors: Do not edit
*/
import fs from 'fs'; // nodejs自带的文件模块
import path from 'path'; // nodejs自带的路径模块
import superangent from 'superagent'; // 爬虫
import cheerio from 'cheerio'; // 服务端的jquery

/** One movie entry extracted from the "now playing" list of the crawled page. */
interface Movie {
// `href` attribute of the poster link; optional because the attribute may be missing.
post?: string;
// `title` attribute of the title link; optional because the attribute may be missing.
name?: string;
}

/** The movies collected by one crawl together with the crawl date string. */
interface MovieJson {
// Date string used as the key under which `data` is stored.
time: string;
// Movies found during this crawl.
data: Movie[];
}

/** Shape of the stored JSON document: each key is a `time` string, each value that crawl's movies. */
interface Content {
[propName: string]: Movie[];
}

/**
 * Crawls the Douban "now playing" page and stores the movie list in a JSON
 * file, keyed by the crawl date (`YYYY-MM-DD`).
 *
 * Constructing an instance immediately starts one crawl.
 */
class Crawler {
  // Page to crawl.
  private readonly targetUrl: string = 'https://movie.douban.com/cinema/nowplaying/nanjing/';
  // Crawl date (`YYYY-MM-DD`); used as the key of this crawl's entry in the JSON file.
  private time: string = '';
  // Path of the JSON file the results are written to.
  private readonly filePath: string = path.resolve(__dirname, '../data/movie.json');

  constructor() {
    this.time = this.formatTimestampToDateString(new Date().getTime());
    // A constructor cannot await init(); attach a handler so a failed request
    // or write is logged instead of surfacing as an unhandled promise rejection.
    this.init().catch((error: unknown) => {
      console.log(error);
    });
  }

  /**
   * Fetches the target page.
   *
   * @returns The response body as text. Request errors propagate to the caller.
   */
  async getHtml(): Promise<string> {
    const result = await superangent.get(this.targetUrl);
    return result.text;
  }

  /**
   * Extracts the movies listed under `#nowplaying .list-item`.
   *
   * @param html Markup of the crawled page.
   * @returns One entry per list item, holding the poster link's `href` and the
   * title link's `title`; either field is undefined when the attribute is missing.
   */
  async processingData(html: string): Promise<Movie[]> {
    const movieList: Movie[] = [];
    const $ = cheerio.load(html);
    $('#nowplaying .list-item').each((i, ele) => {
      const item = $(ele);
      movieList.push({
        post: item.find('.poster a').attr('href'),
        name: item.find('.stitle a').attr('title'),
      });
    });
    return movieList;
  }

  /**
   * Formats a timestamp as `YYYY-MM-DD` using the local-time date getters.
   *
   * @param timestamp Milliseconds since the Unix epoch.
   * @returns The zero-padded date string.
   */
  formatTimestampToDateString(timestamp: number): string {
    const date = new Date(timestamp);
    const year = date.getFullYear();
    // getMonth() is zero-based, so add 1.
    const month = String(date.getMonth() + 1).padStart(2, '0');
    const day = String(date.getDate()).padStart(2, '0');
    return `${year}-${month}-${day}`;
  }

  /**
   * Merges one crawl into the content already stored in the JSON file.
   *
   * @param listData The crawl date and the movies found.
   * @returns The merged content, or undefined when the existing file cannot be
   * read, is not valid JSON, or does not contain a JSON object (the error is logged).
   */
  storageData(listData: MovieJson): Content | undefined {
    try {
      let fileContent: Content = {};
      if (fs.existsSync(this.filePath)) {
        const raw = fs.readFileSync(this.filePath, 'utf-8');
        // An empty file holds nothing to merge; start from an empty object
        // instead of failing in JSON.parse.
        if (raw.trim() !== '') {
          const parsed: unknown = JSON.parse(raw);
          // Properties set on an array or primitive would be dropped by
          // JSON.stringify, silently losing the new entry.
          if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
            throw new Error(`Expected a JSON object in ${this.filePath}`);
          }
          fileContent = parsed as Content;
        }
      }
      fileContent[listData.time] = listData.data;
      return fileContent;
    } catch (error) {
      console.log(error);
      return undefined;
    }
  }

  /**
   * Writes the content to the JSON file, creating missing parent directories.
   *
   * @param fileContent The full content to serialise.
   */
  createJsonFile(fileContent: Content): void {
    // writeFileSync does not create directories; make sure the folder exists.
    fs.mkdirSync(path.dirname(this.filePath), { recursive: true });
    fs.writeFileSync(this.filePath, JSON.stringify(fileContent));
  }

  /**
   * Runs one crawl: fetch the page, extract the movies, merge them into the
   * stored content and write the file.
   */
  async init(): Promise<void> {
    const html = await this.getHtml();
    const listData = await this.processingData(html);
    // storageData returns undefined when the existing file could not be read
    // or parsed (not when the file is merely missing); in that case nothing
    // is written, so the existing file is left as it is.
    const resultData = this.storageData({
      time: this.time,
      data: listData,
    });
    if (!resultData) return;
    this.createJsonFile(resultData);
  }
}

// Creating the instance starts the crawl: the Crawler constructor calls init().
const crawler = new Crawler();

使用组合的设计思路优化代码(将页面解析逻辑作为对象注入 Crawler)

思路:

  • 将公共部分(请求页面、格式化日期、写入文件)提取到 Crawler 类中——爬取策略
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
/*
* @Description:
* @Author: xiuji
* @Date: 2023-07-05 16:54:21
* @LastEditTime: 2023-08-07 15:09:09
* @LastEditors: Do not edit
*/
import fs from 'fs'; // nodejs自带的文件模块
import path from 'path'; // nodejs自带的路径模块
import superangent from 'superagent'; // 爬虫
import NowPlaying from './nowPlaying';

/** Contract for the page analyser that is passed to the Crawler constructor. */
export interface NowPlay {
// Receives the page HTML, the crawl date string and the output file path.
// Resolves to the text to write to the file, or undefined when nothing should be written.
analyse: (html: string, time: string, filePath: string) => Promise<string | undefined>;
}

/**
 * Fetches a page, hands its HTML to the injected analyser and writes the text
 * the analyser returns to a JSON file.
 *
 * Constructing an instance immediately starts one crawl.
 */
class Crawler {
  // Crawl date (`YYYY-MM-DD`), passed to the analyser.
  private time: string = '';
  // Path of the JSON file the results are written to.
  private readonly filePath: string = path.resolve(__dirname, '../data/movie.json');

  /**
   * @param targetUrl Page to crawl.
   * @param nowplaying Analyser that turns the page HTML into the file content.
   */
  constructor(private targetUrl: string, private nowplaying: NowPlay) {
    this.time = this.formatTimestampToDateString(new Date().getTime());
    // A constructor cannot await init(); attach a handler so a failed request
    // or write is logged instead of surfacing as an unhandled promise rejection.
    this.init().catch((error: unknown) => {
      console.log(error);
    });
  }

  /**
   * Fetches the target page.
   *
   * @returns The response body as text. Request errors propagate to the caller.
   */
  async getHtml(): Promise<string> {
    const result = await superangent.get(this.targetUrl);
    return result.text;
  }

  /**
   * Formats a timestamp as `YYYY-MM-DD` using the local-time date getters.
   *
   * @param timestamp Milliseconds since the Unix epoch.
   * @returns The zero-padded date string.
   */
  formatTimestampToDateString(timestamp: number): string {
    const date = new Date(timestamp);
    const year = date.getFullYear();
    // getMonth() is zero-based, so add 1.
    const month = String(date.getMonth() + 1).padStart(2, '0');
    const day = String(date.getDate()).padStart(2, '0');
    return `${year}-${month}-${day}`;
  }

  /**
   * Writes the given text to the JSON file, creating missing parent directories.
   *
   * @param fileContent Already serialised content to write.
   */
  createJsonFile(fileContent: string): void {
    // writeFileSync does not create directories; make sure the folder exists.
    fs.mkdirSync(path.dirname(this.filePath), { recursive: true });
    fs.writeFileSync(this.filePath, fileContent);
  }

  /**
   * Runs one crawl: fetch the page, let the analyser produce the file content
   * and write it. Nothing is written when the analyser yields no content.
   */
  async init(): Promise<void> {
    const html = await this.getHtml();
    const resultData = await this.nowplaying.analyse(html, this.time, this.filePath);
    if (!resultData) return;
    this.createJsonFile(resultData);
  }
}

// Page to crawl.
const url = 'https://movie.douban.com/cinema/nowplaying/nanjing/';
// Analyser that turns the fetched HTML into the text written to the JSON file.
const nowPlaying = new NowPlaying();
// Constructing the crawler starts the crawl (the Crawler constructor calls init()).
new Crawler(url, nowPlaying);
  • 自定义部分单独封装到一个类中——负责处理获取到的数据,最后将结果 return 给爬取策略类,由后者执行存储操作
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
/*
* @Description:
* @Author: xiuji
* @Date: 2023-08-07 09:32:16
* @LastEditTime: 2023-08-07 15:09:35
* @LastEditors: Do not edit
*/
import fs from 'fs'; // nodejs自带的文件模块
import cheerio from 'cheerio'; // 服务端的jquery
import { NowPlay } from './index';
/** One movie entry extracted from the "now playing" list of the crawled page. */
interface Movie {
// `href` attribute of the poster link; optional because the attribute may be missing.
post?: string;
// `title` attribute of the title link; optional because the attribute may be missing.
name?: string;
}

/** The movies collected by one crawl together with the crawl date string. */
interface MovieJson {
// Date string used as the key under which `data` is stored.
time: string;
// Movies found during this crawl.
data: Movie[];
}

/** Shape of the stored JSON document: each key is a `time` string, each value that crawl's movies. */
interface Content {
[propName: string]: Movie[];
}

/**
 * Analyser for the "now playing" page: extracts the movie list from the HTML
 * and merges it into the content already stored in the output file.
 */
export default class NowPlaying implements NowPlay {
  /**
   * Extracts the movies listed under `#nowplaying .list-item`.
   *
   * @param html Markup of the crawled page.
   * @returns One entry per list item, holding the poster link's `href` and the
   * title link's `title`; either field is undefined when the attribute is missing.
   */
  private async processingData(html: string): Promise<Movie[]> {
    const movieList: Movie[] = [];
    const $ = cheerio.load(html);
    $('#nowplaying .list-item').each((i, ele) => {
      const item = $(ele);
      movieList.push({
        post: item.find('.poster a').attr('href'),
        name: item.find('.stitle a').attr('title'),
      });
    });
    return movieList;
  }

  /**
   * Merges one crawl into the content already stored in `filePath`.
   *
   * @param listData The crawl date and the movies found.
   * @param filePath JSON file holding earlier crawls; it may not exist yet.
   * @returns The merged content, or undefined when the existing file cannot be
   * read, is not valid JSON, or does not contain a JSON object (the error is logged).
   */
  private storageData(listData: MovieJson, filePath: string): Content | undefined {
    try {
      let fileContent: Content = {};
      if (fs.existsSync(filePath)) {
        const raw = fs.readFileSync(filePath, 'utf-8');
        // An empty file holds nothing to merge; start from an empty object
        // instead of failing in JSON.parse.
        if (raw.trim() !== '') {
          const parsed: unknown = JSON.parse(raw);
          // Properties set on an array or primitive would be dropped by
          // JSON.stringify, silently losing the new entry.
          if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
            throw new Error(`Expected a JSON object in ${filePath}`);
          }
          fileContent = parsed as Content;
        }
      }
      fileContent[listData.time] = listData.data;
      return fileContent;
    } catch (error) {
      console.log(error, 'storageData');
      return undefined;
    }
  }

  /**
   * Builds the text to write to the output file for one crawl.
   *
   * @param html Markup of the crawled page.
   * @param time Crawl date string used as the key of the new entry.
   * @param filePath JSON file holding earlier crawls.
   * @returns The serialised merged content, or undefined when merging failed.
   */
  public async analyse(html: string, time: string, filePath: string): Promise<string | undefined> {
    const listData = await this.processingData(html);
    const resultData = this.storageData({ time, data: listData }, filePath);
    // Make the failure path explicit rather than relying on
    // JSON.stringify(undefined) evaluating to undefined.
    if (resultData === undefined) return undefined;
    return JSON.stringify(resultData);
  }
}

使用单例模式完善数据处理类

通过私有构造函数和静态方法 getInstance,保证当前类只能创建一个实例

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
/*
* @Description:
* @Author: xiuji
* @Date: 2023-08-07 09:32:16
* @LastEditTime: 2023-08-07 15:26:46
* @LastEditors: Do not edit
*/
import fs from 'fs'; // nodejs自带的文件模块
import cheerio from 'cheerio'; // 服务端的jquery
import { NowPlay } from './index';
/** One movie entry extracted from the "now playing" list of the crawled page. */
interface Movie {
// `href` attribute of the poster link; optional because the attribute may be missing.
post?: string;
// `title` attribute of the title link; optional because the attribute may be missing.
name?: string;
}

/** The movies collected by one crawl together with the crawl date string. */
interface MovieJson {
// Date string used as the key under which `data` is stored.
time: string;
// Movies found during this crawl.
data: Movie[];
}

/** Shape of the stored JSON document: each key is a `time` string, each value that crawl's movies. */
interface Content {
[propName: string]: Movie[];
}

/**
 * Analyser for the "now playing" page: extracts the movie list from the HTML
 * and merges it into the content already stored in the output file.
 *
 * The constructor is private; obtain the single shared instance through
 * `NowPlaying.getInstance()`.
 */
export default class NowPlaying implements NowPlay {
  // Lazily created shared instance; undefined until getInstance() is first called.
  private static instance: NowPlaying | undefined;

  /**
   * Returns the shared instance, creating it on the first call.
   *
   * @returns The same NowPlaying object on every call.
   */
  static getInstance(): NowPlaying {
    if (!NowPlaying.instance) {
      NowPlaying.instance = new NowPlaying();
    }
    return NowPlaying.instance;
  }

  /**
   * Extracts the movies listed under `#nowplaying .list-item`.
   *
   * @param html Markup of the crawled page.
   * @returns One entry per list item, holding the poster link's `href` and the
   * title link's `title`; either field is undefined when the attribute is missing.
   */
  private async processingData(html: string): Promise<Movie[]> {
    const movieList: Movie[] = [];
    const $ = cheerio.load(html);
    $('#nowplaying .list-item').each((i, ele) => {
      const item = $(ele);
      movieList.push({
        post: item.find('.poster a').attr('href'),
        name: item.find('.stitle a').attr('title'),
      });
    });
    return movieList;
  }

  /**
   * Merges one crawl into the content already stored in `filePath`.
   *
   * @param listData The crawl date and the movies found.
   * @param filePath JSON file holding earlier crawls; it may not exist yet.
   * @returns The merged content, or undefined when the existing file cannot be
   * read, is not valid JSON, or does not contain a JSON object (the error is logged).
   */
  private storageData(listData: MovieJson, filePath: string): Content | undefined {
    try {
      let fileContent: Content = {};
      if (fs.existsSync(filePath)) {
        const raw = fs.readFileSync(filePath, 'utf-8');
        // An empty file holds nothing to merge; start from an empty object
        // instead of failing in JSON.parse.
        if (raw.trim() !== '') {
          const parsed: unknown = JSON.parse(raw);
          // Properties set on an array or primitive would be dropped by
          // JSON.stringify, silently losing the new entry.
          if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
            throw new Error(`Expected a JSON object in ${filePath}`);
          }
          fileContent = parsed as Content;
        }
      }
      fileContent[listData.time] = listData.data;
      return fileContent;
    } catch (error) {
      console.log(error, 'storageData');
      return undefined;
    }
  }

  /**
   * Builds the text to write to the output file for one crawl.
   *
   * @param html Markup of the crawled page.
   * @param time Crawl date string used as the key of the new entry.
   * @param filePath JSON file holding earlier crawls.
   * @returns The serialised merged content, or undefined when merging failed.
   */
  public async analyse(html: string, time: string, filePath: string): Promise<string | undefined> {
    const listData = await this.processingData(html);
    const resultData = this.storageData({ time, data: listData }, filePath);
    // Make the failure path explicit rather than relying on
    // JSON.stringify(undefined) evaluating to undefined.
    if (resultData === undefined) return undefined;
    return JSON.stringify(resultData);
  }

  // Private so that instances can only be created through getInstance().
  private constructor() { }
}

调用爬虫

1
2
3
// Page to crawl.
const url = 'https://movie.douban.com/cinema/nowplaying/nanjing/';
// Obtain the analyser through the static getInstance() accessor rather than `new`.
const nowPlaying = NowPlaying.getInstance();
// Constructing the crawler with the URL and the analyser.
new Crawler(url, nowPlaying);