如何详细配置 AWS Lambda 以使用 Puppeteer?我是 Lambda 和 Puppeteer 的初学者

How to configure AWS Lambda to work with Puppeteer in detail, as I'm a beginner in Lambda as well as Puppeteer

提问人:Rishabh Shukla 提问时间:11/6/2023 最后编辑:Rishabh Shukla 更新时间:11/7/2023 访问量:39

问:

我的 package.json:

"dependencies": {
"chrome-aws-lambda": "^10.1.0",
"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-stealth": "^2.11.2",
"puppeteer-core": "^10.1.0"

}

我无法加载整个站点:页面在加载过程中停止,并返回空(null)错误。配置(AWS Lambda):

  1. Node.js - 14.0
  2. 内存 - 2000MB
  3. 临时存储 - 1024MB

我不知道我在 Lambda 的配置上做错了什么:我已将代码上传到 S3,下面是主函数的代码。

代码:

    const pupeteerExtra = require('puppeteer-extra');
    const pupeteerExtraPluginStealth = require('puppeteer-extra-plugin-stealth');
    const { DOMParser } = require('domino');
    const chromium = require('chrome-aws-lambda');

    /**
     * Polls the page until the length of its serialized HTML stops changing.
     *
     * The page counts as rendered once `page.content()` has reported the same
     * non-zero length on three consecutive polls. Polling happens once per
     * second and gives up silently after `timeout / 1000` polls.
     *
     * @param {object} page - Puppeteer page; `content`, `evaluate` and
     *   `waitForTimeout` are called on it.
     * @param {number} [timeout=30000] - Total polling budget in milliseconds.
     * @returns {Promise<void>} Resolves when the size is stable or the budget is spent.
     */
    const waitTillHTMLRendered = async (page, timeout = 30000) => {
        const pollIntervalMs = 1000;
        const requiredStableRounds = 3;
        const pollLimit = timeout / pollIntervalMs;

        let previousLength = 0;
        let stableRounds = 0;

        for (let round = 1; round <= pollLimit; round += 1) {
            const markup = await page.content();
            const currentLength = markup.length;

            // Only used for the diagnostic log line below.
            const bodyLength = await page.evaluate(() => document.body.innerHTML.length);

            console.log('last: ', previousLength, ' <> curr: ', currentLength, ' body html size: ', bodyLength);

            // The very first poll (previousLength of 0) never counts as stable.
            const isUnchanged = previousLength !== 0 && currentLength === previousLength;
            stableRounds = isUnchanged ? stableRounds + 1 : 0;

            if (stableRounds >= requiredStableRounds) {
                console.log('Page rendered fully..');
                break;
            }

            previousLength = currentLength;
            await page.waitForTimeout(pollIntervalMs);
        }
    };


    /**
     * Dismisses the element matched by `#mat-dialog-title-0 > span` if it shows
     * up (presumably a modal dialog — confirm against the target site), then
     * scrapes the flight results from the page.
     *
     * Scraping now runs on every path. Previously it only ran when waiting for
     * the dialog failed, so a successfully dismissed dialog made this function
     * resolve to `undefined`. With the caller's default timeout set to 0, a
     * missing dialog made the wait never finish.
     *
     * @param {object} page - Puppeteer page; `waitForSelector` and `evaluate`
     *   are called on it.
     * @param {number} [dialogTimeoutMs=5000] - How long to wait for the dialog
     *   before assuming it is absent. Passed explicitly so that a default
     *   timeout of 0 (no timeout) set on the page cannot block forever.
     * @returns {Promise<string|false>} JSON string `{ classCabin, scrapedData }`,
     *   or `false` when the page heading reads "No flights available".
     */
    async function getData(page, dialogTimeoutMs = 5000) {
        try {
            const cancel = await page.waitForSelector('#mat-dialog-title-0 > span', { timeout: dialogTimeoutMs });
            await cancel.click();
        } catch (error) {
            // Best effort: the dialog did not appear in time or could not be
            // clicked. Either way, carry on and scrape whatever is on the page.
        }

        const data = await page.evaluate(scrapeFlightResults);
        console.log(data);
        return data;
    }

    /**
     * Collects flight rows from the current document.
     *
     * Passed to `page.evaluate`, so it executes in the browser context: it must
     * stay self-contained and must not reference anything from the Node module
     * scope.
     *
     * @returns {string|false} JSON string `{ classCabin, scrapedData }`, or
     *   `false` when the `h1` text is "No flights available".
     */
    function scrapeFlightResults() {
        const notAvailable = 'No flights available';
        const textOf = (node, fallback) => (node ? node.textContent.trim() : fallback);

        // A missing <h1> is treated as "results may be present" instead of throwing.
        const heading = document.querySelector('h1');
        if (heading && heading.textContent.trim() === notAvailable) {
            return false;
        }

        const classCabin = Array.from(
            document.querySelectorAll('.cabin-heading'),
            (cabinHeading) => cabinHeading.textContent.trim()
        );

        const rows = document.querySelectorAll('.upsell-row.stop-over.ng-star-inserted');

        // Synchronous callback: an exception here surfaces through page.evaluate
        // rather than becoming an unhandled promise rejection.
        const scrapedData = Array.from(rows, (row) => {
            // Query the live container directly; a row without one yields no cabins.
            const cabinsContainer = row.querySelector('.cabins-container.ng-star-inserted');
            const cabins = cabinsContainer
                ? Array.from(cabinsContainer.querySelectorAll('kilo-cabin-cell-pres'))
                : [];

            const data = cabins.map((cabin) => ({
                seatLeft: textOf(cabin.querySelector('.seat-text.ng-star-inserted'), ''),
                points: textOf(cabin.querySelector('.points-total'), ''),
                cash: textOf(cabin.querySelector('.remaining-cash'), ''),
                mixedCabin: textOf(cabin.querySelector('.mixed-cabin.good.ng-star-inserted'), ''),
            }));

            const operatingAirlineElement = row.querySelector('.operating-airline-icon');

            return {
                departureTime: textOf(row.querySelector('.departure-time'), 'Not available'),
                arrivalTime: textOf(row.querySelector('.arrival-time'), 'Not available'),
                duration: textOf(row.querySelector('.flight-summary.ng-star-inserted'), 'Not available'),
                layover: Array.from(
                    row.querySelectorAll('.connection-time.mat-caption.ng-star-inserted'),
                    (layoverElement) => layoverElement.textContent.trim()
                ),
                operatingAirline: operatingAirlineElement
                    ? operatingAirlineElement.getAttribute('alt')
                    : 'Not available',
                specificClass: textOf(row.querySelector('.cabin-text'), 'Not available'),
                data,
            };
        });

        return JSON.stringify({ classCabin, scrapedData });
    }


     exports.handler = async (event, context, callback) => {
    const url = 'https://www.aircanada.com/aeroplan/redeem/availability/outbound?org0=DEL&dest0=YYC&departureDate0=2023-12-08&lang=en-CA&tripType=O&ADT=1&YTH=0&CHD=0&INF=0&INS=0&marketCode=INT';
    try {
        console.log(chromium.headless)
        pupeteerExtra.use(pupeteerExtraPluginStealth());
        // const args = chromium.args.filter(item => item !== '--use-gl=swiftshader');
        // console.log(args)
        const browser = await pupeteerExtra.launch({
             args: [
            '--disable-gpu',
            '--disable-dev-shm-usage',
            '--disable-setuid-sandbox',
            '--no-first-run',
            '--no-sandbox',
            '--no-zygote',
            '--single-process', // <- this one doesn't works in Windows
        ],
            defaultViewport: chromium.defaultViewport,
            executablePath: await chromium.executablePath,
            headless: true,
            ignoreHTTPSErrors: true,
        });
        const page = await browser.newPage();
        page.setDefaultNavigationTimeout(0);
        page.setDefaultTimeout(0);
        await page.goto(url, { waitUntil: 'networkidle0' });
       await waitTillHTMLRendered(page)
       const pageTitle =  await getData(page);
       console.log('vande ma')
        return pageTitle
        // return callback(null, pageTitle);
    } catch (error) {
        console.log('Error at test.js:', error.message)
    }
};

 


T

我尝试过使用层(Layer),但没有成功,不知道自己哪里做错了。Layer1 - package.json 中所需软件包的所有 node_modules,不包括 domino;Layer2 - domino 的 node 模块及其所有文件;Layer - 主处理程序文件。这是我试图抓取以获取航班详细信息的链接:https://www.aircanada.com/aeroplan/redeem/availability/outbound?org0=DEL&dest0=YYC&departureDate0=2023-12-08&lang=en-CA&tripType=O&ADT=1&YTH=0&CHD=0&INF=0&INS=0&marketCode=INT

web-scraping puppeteer chromium aws-sdk-nodejs aws-lambda-layers

评论


答: 暂无答案