使用 TypeScript 实现基础的爬虫
安装项目需要的依赖
1 2
pnpm i superagent
pnpm i cheerio
|
起步
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123
|
import fs from 'fs'; import path from 'path'; import superangent from 'superagent'; import cheerio from 'cheerio';
/**
 * One entry scraped from the "now playing" list. `post` holds the `href` of
 * the item's `.poster a` link and `name` the `title` attribute of its
 * `.stitle a` link; either is absent when the attribute is missing.
 */
interface Movie {
  post?: string;
  name?: string;
}

/** The movies collected by a single crawl plus the date string they are stored under. */
interface MovieJson {
  time: string;
  data: Movie[];
}

/** Shape of the persisted JSON file: one movie list per crawl date key. */
interface Content {
  [date: string]: Movie[];
}
/**
 * Scrapes the Douban "now playing" page for Nanjing and stores the movie list,
 * keyed by the crawl date, in `../data/movie.json` (relative to this module).
 *
 * Constructing an instance starts a crawl immediately.
 */
class Crawler {
  private targetUrl: string = 'https://movie.douban.com/cinema/nowplaying/nanjing/';
  private time: string = '';
  private filePath: string = path.resolve(__dirname, '../data/movie.json');

  constructor() {
    this.time = this.formatTimestampToDateString(new Date().getTime());
    // A constructor cannot await init(); without a handler a failed request
    // would surface as an unhandled promise rejection. Report it with context
    // and keep a failing exit status instead.
    this.init().catch((error: unknown) => {
      console.error('Crawler.init failed:', error);
      process.exitCode = 1;
    });
  }

  /** Downloads the target page and resolves with its response text. */
  async getHtml() {
    const result = await superangent.get(this.targetUrl);
    return result.text;
  }

  /**
   * Extracts one {@link Movie} per `#nowplaying .list-item` element.
   *
   * @param html Markup of the "now playing" page.
   * @returns The movies in document order; a field is `undefined` when the
   *          corresponding attribute is not present on the item.
   */
  async processingData(html: string): Promise<Movie[]> {
    const movieList: Movie[] = [];
    const $ = cheerio.load(html);
    $('#nowplaying .list-item').each((_index, ele) => {
      const item = $(ele);
      movieList.push({
        post: item.find('.poster a').attr('href'),
        name: item.find('.stitle a').attr('title'),
      });
    });
    return movieList;
  }

  /**
   * Formats a millisecond timestamp as a local-time `YYYY-MM-DD` string.
   *
   * @param timestamp Milliseconds since the Unix epoch.
   */
  formatTimestampToDateString(timestamp: number): string {
    const date = new Date(timestamp);
    const year = date.getFullYear();
    const month = String(date.getMonth() + 1).padStart(2, '0'); // getMonth() is zero-based
    const day = String(date.getDate()).padStart(2, '0');
    return `${year}-${month}-${day}`;
  }

  /**
   * Merges a crawl result into whatever is already stored in the JSON file.
   *
   * @param listData The crawl date and the movies collected on it.
   * @returns The merged content (an existing entry for the same date is
   *          replaced), or `undefined` when the existing file cannot be read
   *          or does not hold a JSON object; the error is logged.
   */
  storageData(listData: MovieJson): Content | undefined {
    try {
      let fileContent: Content = {};
      if (fs.existsSync(this.filePath)) {
        const raw = fs.readFileSync(this.filePath, 'utf-8');
        // A blank file means "no history yet"; JSON.parse('') would throw and
        // block every later crawl from being stored.
        if (raw.trim() !== '') {
          const parsed: unknown = JSON.parse(raw);
          // An array or primitive would silently lose the new entry when serialised.
          if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
            throw new Error(`${this.filePath} does not contain a JSON object`);
          }
          fileContent = parsed as Content;
        }
      }
      fileContent[listData.time] = listData.data;
      return fileContent;
    } catch (error) {
      console.log(error);
      return undefined;
    }
  }

  /**
   * Serialises the content and writes it to the JSON file, creating the
   * parent directory first so a fresh checkout without `data/` still works.
   */
  createJsonFile(fileContent: Content) {
    fs.mkdirSync(path.dirname(this.filePath), { recursive: true });
    fs.writeFileSync(this.filePath, JSON.stringify(fileContent));
  }

  /** Runs one crawl: download, parse, merge with stored data, write back. */
  async init() {
    const html = await this.getHtml();
    const listData = await this.processingData(html);
    const resultData = this.storageData({ time: this.time, data: listData });
    if (!resultData) return; // storageData already logged the reason
    this.createJsonFile(resultData);
  }
}
// Creating the instance is all that is needed: the Crawler constructor itself starts the crawl via init().
const crawler = new Crawler();
|
使用组合设计模式优化代码
思路:
- 通用部分保留在 Crawler 类中——只负责请求页面和写入文件,具体的数据处理对象通过构造函数传入
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
|
import fs from 'fs'; import path from 'path'; import superangent from 'superagent'; import NowPlaying from './nowPlaying';
/**
 * Contract for the object handed to the Crawler that turns a downloaded page
 * into the text to be written to disk.
 */
export interface NowPlay {
  /**
   * @param html     Markup of the downloaded page.
   * @param time     Date string the crawl result is stored under.
   * @param filePath Path of the JSON file holding earlier results.
   * @returns The text to write to `filePath`, or `undefined` when nothing should be written.
   */
  analyse: (
    html: string,
    time: string,
    filePath: string,
  ) => Promise<string | undefined>;
}
/**
 * Generic crawl runner: downloads `targetUrl`, hands the HTML to the injected
 * analyser and writes whatever text it returns to `../data/movie.json`
 * (relative to this module).
 *
 * Constructing an instance starts a crawl immediately.
 */
class Crawler {
  private time: string = '';
  private filePath: string = path.resolve(__dirname, '../data/movie.json');

  /**
   * @param targetUrl  Page to download.
   * @param nowplaying Analyser that converts the page into the file content.
   */
  constructor(private targetUrl: string, private nowplaying: NowPlay) {
    this.time = this.formatTimestampToDateString(new Date().getTime());
    // A constructor cannot await init(); without a handler a failed request
    // would surface as an unhandled promise rejection. Report it with context
    // and keep a failing exit status instead.
    this.init().catch((error: unknown) => {
      console.error('Crawler.init failed:', error);
      process.exitCode = 1;
    });
  }

  /** Downloads the target page and resolves with its response text. */
  async getHtml() {
    const result = await superangent.get(this.targetUrl);
    return result.text;
  }

  /**
   * Formats a millisecond timestamp as a local-time `YYYY-MM-DD` string.
   *
   * @param timestamp Milliseconds since the Unix epoch.
   */
  formatTimestampToDateString(timestamp: number): string {
    const date = new Date(timestamp);
    const year = date.getFullYear();
    const month = String(date.getMonth() + 1).padStart(2, '0'); // getMonth() is zero-based
    const day = String(date.getDate()).padStart(2, '0');
    return `${year}-${month}-${day}`;
  }

  /**
   * Writes the given text to the JSON file, creating the parent directory
   * first so a fresh checkout without `data/` still works.
   */
  createJsonFile(fileContent: string) {
    fs.mkdirSync(path.dirname(this.filePath), { recursive: true });
    fs.writeFileSync(this.filePath, fileContent);
  }

  /** Runs one crawl: download, analyse, and write the result if there is one. */
  async init() {
    const html = await this.getHtml();
    const resultData = await this.nowplaying.analyse(html, this.time, this.filePath);
    if (!resultData) return; // the analyser produced nothing to store
    this.createJsonFile(resultData);
  }
}
// Entry point: crawl the Nanjing "now playing" page with a NowPlaying analyser.
// The Crawler constructor starts the crawl, so the instance is not kept.
const url = 'https://movie.douban.com/cinema/nowplaying/nanjing/';
new Crawler(url, new NowPlaying());
|
- 自定义部分单独抽取到一个类中——负责处理获取到的数据,最后把结果 return 给负责爬取的 Crawler 类进行存储操作
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86
|
// Movie (declared at the end of the import line below): one scraped list item — `post` is the href of its `.poster a` link, `name` the title attribute of its `.stitle a` link.
import fs from 'fs'; import cheerio from 'cheerio'; import { NowPlay } from './index'; interface Movie { post?: string; name?: string; }
// MovieJson: the movie list produced by one crawl together with the `time` key it is stored under.
interface MovieJson { time: string; data: Movie[]; }
// Content: the whole JSON file — maps each `time` key to that crawl's movie list.
interface Content { [propName: string]: Movie[]; }
/**
 * Analyser for the Douban "now playing" page: extracts the movie list from
 * the HTML and merges it into the content of the JSON history file.
 */
export default class NowPlaying implements NowPlay {
  /**
   * Extracts one {@link Movie} per `#nowplaying .list-item` element.
   *
   * @param html Markup of the "now playing" page.
   * @returns The movies in document order; a field is `undefined` when the
   *          corresponding attribute is not present on the item.
   */
  private async processingData(html: string): Promise<Movie[]> {
    const movieList: Movie[] = [];
    const $ = cheerio.load(html);
    $('#nowplaying .list-item').each((_index, ele) => {
      const item = $(ele);
      movieList.push({
        post: item.find('.poster a').attr('href'),
        name: item.find('.stitle a').attr('title'),
      });
    });
    return movieList;
  }

  /**
   * Merges a crawl result into whatever is already stored at `filePath`.
   *
   * @param listData The `time` key and the movies collected for it.
   * @param filePath JSON file holding earlier results; it may not exist yet.
   * @returns The merged content (an existing entry for the same key is
   *          replaced), or `undefined` when the existing file cannot be read
   *          or does not hold a JSON object; the error is logged.
   */
  private storageData(listData: MovieJson, filePath: string): Content | undefined {
    try {
      let fileContent: Content = {};
      if (fs.existsSync(filePath)) {
        const raw = fs.readFileSync(filePath, 'utf-8');
        // A blank file means "no history yet"; JSON.parse('') would throw and
        // block every later crawl from being stored.
        if (raw.trim() !== '') {
          const parsed: unknown = JSON.parse(raw);
          // An array or primitive would silently lose the new entry when serialised.
          if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
            throw new Error(`${filePath} does not contain a JSON object`);
          }
          fileContent = parsed as Content;
        }
      }
      fileContent[listData.time] = listData.data;
      return fileContent;
    } catch (error) {
      console.log(error, 'storageData');
      return undefined;
    }
  }

  /**
   * Parses the page and produces the full text to write back to `filePath`.
   *
   * @param html     Markup of the downloaded page.
   * @param time     Key the crawl result is stored under.
   * @param filePath JSON file holding earlier results.
   * @returns The serialised merged content, or `undefined` when merging failed.
   */
  public async analyse(html: string, time: string, filePath: string): Promise<string | undefined> {
    const listData = await this.processingData(html);
    const resultData = this.storageData({ time, data: listData }, filePath);
    // Be explicit: JSON.stringify(undefined) evaluates to undefined even
    // though its declared return type is string.
    return resultData === undefined ? undefined : JSON.stringify(resultData);
  }
}
|
使用单例模式完善数据处理类
通过私有构造函数配合静态方法 getInstance,保证当前类只能创建一个实例
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101
|
// Movie (declared at the end of the import line below): one scraped list item — `post` is the href of its `.poster a` link, `name` the title attribute of its `.stitle a` link.
import fs from 'fs'; import cheerio from 'cheerio'; import { NowPlay } from './index'; interface Movie { post?: string; name?: string; }
// MovieJson: the movie list produced by one crawl together with the `time` key it is stored under.
interface MovieJson { time: string; data: Movie[]; }
// Content: the whole JSON file — maps each `time` key to that crawl's movie list.
interface Content { [propName: string]: Movie[]; }
/**
 * Analyser for the Douban "now playing" page: extracts the movie list from
 * the HTML and merges it into the content of the JSON history file.
 *
 * The constructor is private; obtain the single shared instance through
 * {@link NowPlaying.getInstance}.
 */
export default class NowPlaying implements NowPlay {
  private static instance: NowPlaying;

  /** Returns the shared instance, creating it on first use. */
  static getInstance() {
    if (!NowPlaying.instance) {
      NowPlaying.instance = new NowPlaying();
    }
    return NowPlaying.instance;
  }

  /**
   * Extracts one {@link Movie} per `#nowplaying .list-item` element.
   *
   * @param html Markup of the "now playing" page.
   * @returns The movies in document order; a field is `undefined` when the
   *          corresponding attribute is not present on the item.
   */
  private async processingData(html: string): Promise<Movie[]> {
    const movieList: Movie[] = [];
    const $ = cheerio.load(html);
    $('#nowplaying .list-item').each((_index, ele) => {
      const item = $(ele);
      movieList.push({
        post: item.find('.poster a').attr('href'),
        name: item.find('.stitle a').attr('title'),
      });
    });
    return movieList;
  }

  /**
   * Merges a crawl result into whatever is already stored at `filePath`.
   *
   * @param listData The `time` key and the movies collected for it.
   * @param filePath JSON file holding earlier results; it may not exist yet.
   * @returns The merged content (an existing entry for the same key is
   *          replaced), or `undefined` when the existing file cannot be read
   *          or does not hold a JSON object; the error is logged.
   */
  private storageData(listData: MovieJson, filePath: string): Content | undefined {
    try {
      let fileContent: Content = {};
      if (fs.existsSync(filePath)) {
        const raw = fs.readFileSync(filePath, 'utf-8');
        // A blank file means "no history yet"; JSON.parse('') would throw and
        // block every later crawl from being stored.
        if (raw.trim() !== '') {
          const parsed: unknown = JSON.parse(raw);
          // An array or primitive would silently lose the new entry when serialised.
          if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
            throw new Error(`${filePath} does not contain a JSON object`);
          }
          fileContent = parsed as Content;
        }
      }
      fileContent[listData.time] = listData.data;
      return fileContent;
    } catch (error) {
      console.log(error, 'storageData');
      return undefined;
    }
  }

  /**
   * Parses the page and produces the full text to write back to `filePath`.
   *
   * @param html     Markup of the downloaded page.
   * @param time     Key the crawl result is stored under.
   * @param filePath JSON file holding earlier results.
   * @returns The serialised merged content, or `undefined` when merging failed.
   */
  public async analyse(html: string, time: string, filePath: string): Promise<string | undefined> {
    const listData = await this.processingData(html);
    const resultData = this.storageData({ time, data: listData }, filePath);
    // Be explicit: JSON.stringify(undefined) evaluates to undefined even
    // though its declared return type is string.
    return resultData === undefined ? undefined : JSON.stringify(resultData);
  }

  // Private so that getInstance() is the only way to obtain an instance.
  private constructor() {}
}
|
调用爬虫
1 2 3
// Entry point: crawl the Nanjing "now playing" page using the shared
// NowPlaying instance; the Crawler constructor starts the crawl.
const url = 'https://movie.douban.com/cinema/nowplaying/nanjing/';
const nowPlaying = NowPlaying.getInstance();
new Crawler(url, nowPlaying);
|