使用 TypeScript 实现基础的爬虫

安装项目需要的依赖

pnpm i superagent
pnpm i cheerio

起步

/*
 * @Description:
 * @Author: xiuji
 * @Date: 2023-07-05 16:54:21
 * @LastEditTime: 2023-08-01 14:09:15
 * @LastEditors: Do not edit
 */
import fs from 'fs'; // nodejs自带的文件模块
import path from 'path'; // nodejs自带的路径模块
import superangent from 'superagent'; // 爬虫
import cheerio from 'cheerio'; // 服务端的jquery

/** One movie entry scraped from the "now playing" list. */
interface Movie {
    // Value of the `href` attribute of the entry's `.poster a` link
    post?: string;
    // Value of the `title` attribute of the entry's `.stitle a` link
    name?: string;
}

/** The movies collected by one crawl, together with the crawl date. */
interface MovieJson {
    // Crawl date string; used as the key under which `data` is stored
    time: string;
    // Movies collected by this crawl
    data: Movie[];
}

/** Shape of the stored JSON file: one movie list per crawl date key. */
interface Content {
    [propName: string]: Movie[];
}

/**
 * Crawls the "now playing" page at `targetUrl` and stores the scraped movie
 * list in a JSON file, keyed by the crawl date.
 *
 * Constructing an instance starts the crawl immediately.
 */
class Crawler {
    // Target URL to crawl
    private targetUrl: string = 'https://movie.douban.com/cinema/nowplaying/nanjing/';
    // Crawl date (YYYY-MM-DD, local time); key of this run inside the JSON file
    private time: string = '';
    // Path of the JSON file the results are stored in
    private filePath: string = path.resolve(__dirname, '../data/movie.json');

    constructor() {
        this.time = this.formatTimestampToDateString(new Date().getTime());
        // A constructor cannot await, so the crawl is fire-and-forget. Attach a
        // handler so a network/file failure is reported with context instead
        // of surfacing as an unhandled promise rejection.
        this.init().catch((error: unknown) => {
            console.error('Crawler failed:', error);
            process.exitCode = 1;
        });
    }

    /**
     * Downloads the target page.
     *
     * @returns The response body as text.
     */
    async getHtml(): Promise<string> {
        const result = await superangent.get(this.targetUrl);
        return result.text;
    }

    /**
     * Extracts the list of currently playing movies from the page HTML.
     *
     * @param html - HTML of the "now playing" page.
     * @returns One entry per `#nowplaying .list-item` element; `post` and
     * `name` are `undefined` when the corresponding attribute is missing.
     */
    async processingData(html: string): Promise<Movie[]> {
        const movieList: Movie[] = [];
        const $ = cheerio.load(html);
        $('#nowplaying .list-item').each((_index, ele) => {
            const item = $(ele);
            movieList.push({
                post: item.find('.poster a').attr('href'),
                name: item.find('.stitle a').attr('title'),
            });
        });
        return movieList;
    }

    /**
     * Formats a timestamp as a `YYYY-MM-DD` string in local time.
     *
     * @param timestamp - Milliseconds since the Unix epoch.
     */
    formatTimestampToDateString(timestamp: number): string {
        const date = new Date(timestamp);
        const year = date.getFullYear();
        const month = String(date.getMonth() + 1).padStart(2, '0'); // getMonth() is zero-based
        const day = String(date.getDate()).padStart(2, '0');
        return `${year}-${month}-${day}`;
    }

    /**
     * Reads the content already stored in the JSON file.
     *
     * @returns The parsed object, or an empty object when the file is missing
     * or blank.
     * @throws When the file cannot be read, is not valid JSON, or its top-level
     * value is not a plain object (e.g. an array, whose extra keys would be
     * silently dropped by `JSON.stringify`).
     */
    private readStoredContent(): Content {
        if (!fs.existsSync(this.filePath)) return {};
        const raw = fs.readFileSync(this.filePath, 'utf-8');
        if (raw.trim() === '') return {};
        const parsed: unknown = JSON.parse(raw);
        if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
            throw new TypeError(`Expected a JSON object in ${this.filePath}`);
        }
        return parsed as Content;
    }

    /**
     * Merges the movies of one crawl into the content already stored on disk.
     *
     * @param listData - Crawl date and the movies collected on that date.
     * @returns The merged content, or `undefined` when the existing file could
     * not be read or does not contain a JSON object (the error is logged).
     */
    storageData(listData: MovieJson): Content | undefined {
        try {
            const fileContent = this.readStoredContent();
            fileContent[listData.time] = listData.data;
            return fileContent;
        } catch (error) {
            console.log(error);
            return undefined;
        }
    }

    /**
     * Writes the content to the JSON file, creating the parent directory
     * first so the initial run does not fail on a missing folder.
     *
     * @param fileContent - Full content to serialise into the file.
     */
    createJsonFile(fileContent: Content) {
        fs.mkdirSync(path.dirname(this.filePath), { recursive: true });
        fs.writeFileSync(this.filePath, JSON.stringify(fileContent));
    }

    /**
     * Runs one crawl: download the page, extract the movies, merge them with
     * the stored data and write the file.
     */
    async init() {
        const html = await this.getHtml();
        const listData = await this.processingData(html);
        // storageData returns undefined when the existing file could not be
        // read or parsed; in that case nothing is written so the file on disk
        // is left untouched.
        const resultData = this.storageData({
            time: this.time,
            data: listData
        });
        if (!resultData) return;
        this.createJsonFile(resultData);
    }
}

// Creating the instance starts the crawl: the constructor calls init().
const crawler = new Crawler();

使用组合设计模式优化代码

思路：

公共部分提取出来——爬取策略

/*
 * @Description:
 * @Author: xiuji
 * @Date: 2023-07-05 16:54:21
 * @LastEditTime: 2023-08-07 15:09:09
 * @LastEditors: Do not edit
 */
import fs from 'fs'; // nodejs自带的文件模块
import path from 'path'; // nodejs自带的路径模块
import superangent from 'superagent'; // 爬虫
import NowPlaying from './nowPlaying';

/**
 * Contract for the page-specific analyser that is passed into the Crawler.
 */
export interface NowPlay {
    // Receives the downloaded HTML, the crawl date string and the path of the
    // JSON file; resolves to the string the Crawler writes to that file, or
    // to undefined, in which case the Crawler writes nothing.
    analyse: (html: string, time: string, filePath: string) => Promise<string | undefined>;
}

/**
 * Generic crawl driver: downloads `targetUrl`, hands the HTML to the injected
 * analyser and writes whatever string the analyser returns to the JSON file.
 *
 * Constructing an instance starts the crawl immediately.
 */
class Crawler {
    // Crawl date (YYYY-MM-DD, local time), passed to the analyser
    private time: string = '';
    // Path of the JSON file the results are written to
    private filePath: string = path.resolve(__dirname, '../data/movie.json');

    /**
     * @param targetUrl - Page to download.
     * @param nowplaying - Analyser that turns the HTML into the file content.
     */
    constructor(private targetUrl: string, private nowplaying: NowPlay) {
        this.time = this.formatTimestampToDateString(new Date().getTime());
        // A constructor cannot await, so the crawl is fire-and-forget. Attach a
        // handler so a network/file failure is reported with context instead
        // of surfacing as an unhandled promise rejection.
        this.init().catch((error: unknown) => {
            console.error('Crawler failed:', error);
            process.exitCode = 1;
        });
    }

    /**
     * Downloads the target page.
     *
     * @returns The response body as text.
     */
    async getHtml(): Promise<string> {
        const result = await superangent.get(this.targetUrl);
        return result.text;
    }

    /**
     * Formats a timestamp as a `YYYY-MM-DD` string in local time.
     *
     * @param timestamp - Milliseconds since the Unix epoch.
     */
    formatTimestampToDateString(timestamp: number): string {
        const date = new Date(timestamp);
        const year = date.getFullYear();
        const month = String(date.getMonth() + 1).padStart(2, '0'); // getMonth() is zero-based
        const day = String(date.getDate()).padStart(2, '0');
        return `${year}-${month}-${day}`;
    }

    /**
     * Writes the given text to the JSON file, creating the parent directory
     * first so the initial run does not fail on a missing folder.
     *
     * @param fileContent - Already serialised content to write.
     */
    createJsonFile(fileContent: string) {
        fs.mkdirSync(path.dirname(this.filePath), { recursive: true });
        fs.writeFileSync(this.filePath, fileContent);
    }

    /**
     * Runs one crawl: download the page, let the analyser build the file
     * content, and write it. Nothing is written when the analyser returns an
     * empty or undefined result.
     */
    async init() {
        const html = await this.getHtml();
        const resultData = await this.nowplaying.analyse(html, this.time, this.filePath);
        if (!resultData) return;
        this.createJsonFile(resultData);
    }
}

// Page to crawl.
const url = 'https://movie.douban.com/cinema/nowplaying/nanjing/';
// Analyser whose analyse() turns the downloaded HTML into the string to store.
const nowPlaying = new NowPlaying();
// The Crawler constructor calls init(), so constructing it starts the crawl.
new Crawler(url, nowPlaying);

自定义部分单独放到一个类中——负责处理获取到的数据，最后把结果 return 给爬取策略的类，由它完成存储操作

/*
 * @Description: 
 * @Author: xiuji
 * @Date: 2023-08-07 09:32:16
 * @LastEditTime: 2023-08-07 15:09:35
 * @LastEditors: Do not edit
 */
import fs from 'fs'; // nodejs自带的文件模块
import cheerio from 'cheerio'; // 服务端的jquery
import { NowPlay } from './index';
/** One movie entry scraped from the "now playing" list. */
interface Movie {
    // Value of the `href` attribute of the entry's `.poster a` link
    post?: string;
    // Value of the `title` attribute of the entry's `.stitle a` link
    name?: string;
}

/** The movies collected by one crawl, together with the time key they are stored under. */
interface MovieJson {
    // Key under which `data` is stored in the JSON file
    time: string;
    // Movies collected by this crawl
    data: Movie[];
}

/** Shape of the stored JSON file: one movie list per time key. */
interface Content {
    [propName: string]: Movie[];
}

/**
 * Analyser for the "now playing" page: extracts the movie list from the HTML
 * and merges it into the content already stored in the JSON file.
 */
export default class NowPlaying implements NowPlay {
    /**
     * Extracts the list of currently playing movies from the page HTML.
     *
     * @param html - HTML of the "now playing" page.
     * @returns One entry per `#nowplaying .list-item` element; `post` and
     * `name` are `undefined` when the corresponding attribute is missing.
     */
    private async processingData(html: string): Promise<Movie[]> {
        const movieList: Movie[] = [];
        const $ = cheerio.load(html);
        $('#nowplaying .list-item').each((_index, ele) => {
            const item = $(ele);
            movieList.push({
                post: item.find('.poster a').attr('href'),
                name: item.find('.stitle a').attr('title'),
            });
        });
        return movieList;
    }

    /**
     * Reads the content already stored in the JSON file.
     *
     * @param filePath - Path of the JSON file.
     * @returns The parsed object, or an empty object when the file is missing
     * or blank.
     * @throws When the file cannot be read, is not valid JSON, or its top-level
     * value is not a plain object (e.g. an array, whose extra keys would be
     * silently dropped by `JSON.stringify`).
     */
    private readStoredContent(filePath: string): Content {
        if (!fs.existsSync(filePath)) return {};
        const raw = fs.readFileSync(filePath, 'utf-8');
        if (raw.trim() === '') return {};
        const parsed: unknown = JSON.parse(raw);
        if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
            throw new TypeError(`Expected a JSON object in ${filePath}`);
        }
        return parsed as Content;
    }

    /**
     * Merges the movies of one crawl into the content already stored on disk.
     *
     * @param listData - Time key and the movies collected for it.
     * @param filePath - Path of the JSON file holding earlier results.
     * @returns The merged content, or `undefined` when the existing file could
     * not be read or does not contain a JSON object (the error is logged).
     */
    private storageData(listData: MovieJson, filePath: string): Content | undefined {
        try {
            const fileContent = this.readStoredContent(filePath);
            fileContent[listData.time] = listData.data;
            return fileContent;
        } catch (error) {
            console.log(error, 'storageData');
            return undefined;
        }
    }

    /**
     * Builds the new file content for one crawl.
     *
     * @param html - HTML of the "now playing" page.
     * @param time - Key under which this crawl's movies are stored.
     * @param filePath - Path of the JSON file holding earlier results.
     * @returns The serialised merged content, or `undefined` when the existing
     * file could not be merged, so the caller can skip writing.
     */
    public async analyse(html: string, time: string, filePath: string): Promise<string | undefined> {
        const listData = await this.processingData(html);
        const resultData = this.storageData({ time, data: listData }, filePath);
        // Make the failure path explicit instead of relying on
        // JSON.stringify(undefined) evaluating to undefined.
        if (resultData === undefined) return undefined;
        return JSON.stringify(resultData);
    }
}

使用单例模式完善数据处理类

通过静态方法getInstance实现当前类只能创建一个实例

/*
 * @Description: 
 * @Author: xiuji
 * @Date: 2023-08-07 09:32:16
 * @LastEditTime: 2023-08-07 15:26:46
 * @LastEditors: Do not edit
 */
import fs from 'fs'; // nodejs自带的文件模块
import cheerio from 'cheerio'; // 服务端的jquery
import { NowPlay } from './index';
/** One movie entry scraped from the "now playing" list. */
interface Movie {
    // Value of the `href` attribute of the entry's `.poster a` link
    post?: string;
    // Value of the `title` attribute of the entry's `.stitle a` link
    name?: string;
}

/** The movies collected by one crawl, together with the time key they are stored under. */
interface MovieJson {
    // Key under which `data` is stored in the JSON file
    time: string;
    // Movies collected by this crawl
    data: Movie[];
}

/** Shape of the stored JSON file: one movie list per time key. */
interface Content {
    [propName: string]: Movie[];
}

/**
 * Analyser for the "now playing" page: extracts the movie list from the HTML
 * and merges it into the content already stored in the JSON file.
 *
 * The constructor is private; obtain the shared instance through
 * `NowPlaying.getInstance()`.
 */
export default class NowPlaying implements NowPlay {
    // Lazily created shared instance; undefined until getInstance() is first called
    private static instance: NowPlaying | undefined;

    // Private so that instances can only be created through getInstance()
    private constructor() { }

    /**
     * Returns the shared instance, creating it on the first call.
     *
     * @returns The same `NowPlaying` object on every call.
     */
    static getInstance(): NowPlaying {
        if (!NowPlaying.instance) {
            NowPlaying.instance = new NowPlaying();
        }
        return NowPlaying.instance;
    }

    /**
     * Extracts the list of currently playing movies from the page HTML.
     *
     * @param html - HTML of the "now playing" page.
     * @returns One entry per `#nowplaying .list-item` element; `post` and
     * `name` are `undefined` when the corresponding attribute is missing.
     */
    private async processingData(html: string): Promise<Movie[]> {
        const movieList: Movie[] = [];
        const $ = cheerio.load(html);
        $('#nowplaying .list-item').each((_index, ele) => {
            const item = $(ele);
            movieList.push({
                post: item.find('.poster a').attr('href'),
                name: item.find('.stitle a').attr('title'),
            });
        });
        return movieList;
    }

    /**
     * Reads the content already stored in the JSON file.
     *
     * @param filePath - Path of the JSON file.
     * @returns The parsed object, or an empty object when the file is missing
     * or blank.
     * @throws When the file cannot be read, is not valid JSON, or its top-level
     * value is not a plain object (e.g. an array, whose extra keys would be
     * silently dropped by `JSON.stringify`).
     */
    private readStoredContent(filePath: string): Content {
        if (!fs.existsSync(filePath)) return {};
        const raw = fs.readFileSync(filePath, 'utf-8');
        if (raw.trim() === '') return {};
        const parsed: unknown = JSON.parse(raw);
        if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
            throw new TypeError(`Expected a JSON object in ${filePath}`);
        }
        return parsed as Content;
    }

    /**
     * Merges the movies of one crawl into the content already stored on disk.
     *
     * @param listData - Time key and the movies collected for it.
     * @param filePath - Path of the JSON file holding earlier results.
     * @returns The merged content, or `undefined` when the existing file could
     * not be read or does not contain a JSON object (the error is logged).
     */
    private storageData(listData: MovieJson, filePath: string): Content | undefined {
        try {
            const fileContent = this.readStoredContent(filePath);
            fileContent[listData.time] = listData.data;
            return fileContent;
        } catch (error) {
            console.log(error, 'storageData');
            return undefined;
        }
    }

    /**
     * Builds the new file content for one crawl.
     *
     * @param html - HTML of the "now playing" page.
     * @param time - Key under which this crawl's movies are stored.
     * @param filePath - Path of the JSON file holding earlier results.
     * @returns The serialised merged content, or `undefined` when the existing
     * file could not be merged, so the caller can skip writing.
     */
    public async analyse(html: string, time: string, filePath: string): Promise<string | undefined> {
        const listData = await this.processingData(html);
        const resultData = this.storageData({ time, data: listData }, filePath);
        // Make the failure path explicit instead of relying on
        // JSON.stringify(undefined) evaluating to undefined.
        if (resultData === undefined) return undefined;
        return JSON.stringify(resultData);
    }
}

调用爬虫：通过 NowPlaying.getInstance() 获取单例，再传入 Crawler

// Page to crawl.
const url = 'https://movie.douban.com/cinema/nowplaying/nanjing/';
// Obtain the analyser through the static getInstance() accessor.
const nowPlaying = NowPlaying.getInstance();
// Pass the page URL and the analyser to a new Crawler.
new Crawler(url, nowPlaying);