部署到 Render 后,在 Node.js 服务器上使用 Puppeteer 抓取图片时出现的问题

Issues with scraping images on a Node JS server using Puppeteer after deployment on Render

提问人:Krystian Owczarek 提问时间:8/3/2023 更新时间:8/3/2023 访问量:48

问:

我有一个服务器:收到 POST 请求后,它使用 Puppeteer 打开一个页面,一直向下滚动到底部,然后返回该页面的 HTML 代码。我的目标是获取页面上所有图片的 URL,因为这些图片是以无限滚动的方式加载的。在本地,一切都按预期工作,也就是说它能返回页面上的所有图片。但是,部署到 Render 之后,请求只返回 3 张图片。原因会不会是免费版服务器可用的 RAM 内存有限(512MB)?

这是我的服务器:

const express = require('express');
const app = express();
const path = require('path')
const { port } = require('./config');
const bodyParser = require('body-parser');
const mime = require('mime')
const cors = require('cors')
const puppeteer = require('puppeteer')

// Enable CORS handling through the `cors` package with its default options.
app.use(cors());

// Serve static files from the directory that contains this script.
const staticRoot = path.join(__dirname, './');
app.use(express.static(staticRoot));

// Headers written on every response that reaches this middleware.
// The allowed origin is pinned to the local front end; adjust it to allow
// only the specific domains that should call this server.
// NOTE(review): these are set after cors() above and apply to all later
// routes, including the Content-Type — confirm that is intended.
const fixedResponseHeaders = [
  ['Access-Control-Allow-Origin', 'http://localhost:3000'],
  ['Access-Control-Allow-Methods', 'POST'],
  ['Access-Control-Allow-Headers', 'Content-Type'],
  ["Content-Type", "application/json"],
];

app.use((req, res, next) => {
  for (const [headerName, headerValue] of fixedResponseHeaders) {
    res.setHeader(headerName, headerValue);
  }
  next();
});

// Turn off the Express `etag` setting.
app.disable('etag');

// Register the `json` extension for the application/json MIME type.
const jsonMimeMapping = { 'application/json': ['json'] };
mime.define(jsonMimeMapping, { force: true });

// Parsers
// Parse request bodies sent with Content-Type: application/json.
app.use(bodyParser.json());

/**
 * Scrolls the window down in fixed steps inside the page context.
 *
 * Every `speedMs` milliseconds the window is scrolled by `distancePx` pixels.
 * Scrolling stops when the accumulated distance reaches the current
 * `scrollHeight` of the element matched by `eltToScroll` (re-read on every
 * step, so content appended while scrolling extends the run), or when
 * `scrollTimeoutMs` milliseconds have elapsed, whichever happens first.
 *
 * @param {object} options
 * @param {object} options.page - Object exposing `evaluate(fn, ...args)`;
 *   presumably a Puppeteer `Page` — the callback below relies on the
 *   browser globals `document` and `window`.
 * @param {number} options.distancePx - Pixels scrolled per step.
 * @param {number} options.speedMs - Delay between steps, in milliseconds.
 * @param {number} options.scrollTimeoutMs - Upper bound for the whole scroll,
 *   in milliseconds.
 * @param {string} options.eltToScroll - CSS selector of the element whose
 *   `scrollHeight` decides when the bottom has been reached.
 * @returns {Promise<void>} Whatever `page.evaluate` returns for the callback:
 *   it settles once scrolling has stopped, and rejects if no element matches
 *   `eltToScroll`.
 */
function scrollToBottom({
  page,
  distancePx,
  speedMs,
  scrollTimeoutMs,
  eltToScroll,
}) {
  // The callback is handed to page.evaluate together with its arguments, so
  // it must stay self-contained and only use the values passed in.
  return page.evaluate(
    (distancePx, speedMs, scrollTimeoutMs, eltToScroll) => {
      return new Promise((resolve, reject) => {
        const elt = document.querySelector(eltToScroll);
        if (elt === null) {
          // Fail fast instead of throwing on every interval tick until the
          // fallback timeout finally resolves.
          reject(
            new Error(
              `scrollToBottom: no element matches selector "${eltToScroll}"`
            )
          );
          return;
        }

        let totalHeight = 0;
        let stepTimer = null;
        let deadlineTimer = null;

        // Single exit path: stop both timers so nothing keeps firing after
        // the promise has settled.
        const finish = () => {
          clearInterval(stepTimer);
          clearTimeout(deadlineTimer);
          resolve();
        };

        stepTimer = setInterval(() => {
          const scrollHeight = elt.scrollHeight;
          window.scrollBy(0, distancePx);
          totalHeight += distancePx;

          if (totalHeight >= scrollHeight) {
            finish();
          }
        }, speedMs);

        // Safety net for pages that keep growing.
        deadlineTimer = setTimeout(finish, scrollTimeoutMs);
      });
    },
    distancePx,
    speedMs,
    scrollTimeoutMs,
    eltToScroll
  );
}
 

/**
 * Parses a request-supplied value as an absolute http(s) URL.
 *
 * @param {unknown} value - Candidate URL taken from the request body.
 * @returns {URL|null} The parsed URL, or null when the value is not a string,
 *   cannot be parsed, or uses a protocol other than http/https.
 */
const parseHttpUrl = (value) => {
  if (typeof value !== 'string') {
    return null;
  }
  try {
    const parsed = new URL(value);
    const isHttp = parsed.protocol === 'http:' || parsed.protocol === 'https:';
    return isHttp ? parsed : null;
  } catch (error) {
    return null;
  }
};

/**
 * Express handler: opens `req.body.url` in a headless browser, scrolls the
 * page with `scrollToBottom`, and responds with the resulting page HTML.
 *
 * Responses:
 * - 400 when `req.body.url` is missing or is not an absolute http(s) URL
 *   (no browser is launched in that case);
 * - the page HTML on success;
 * - 500 with the error text when launching, navigating or scrolling fails.
 *
 * @param {object} req - Request; `req.body.url` is the page to load.
 * @param {object} res - Response; uses `status()` and `send()`.
 * @returns {Promise<void>}
 */
const scrollAndGetPageHTML = async (req, res) => {
  // The URL comes straight from the client, so only let http(s) through to
  // the browser (this also rejects e.g. file: URLs).
  const targetUrl = parseHttpUrl(req.body && req.body.url);
  if (targetUrl === null) {
    res.status(400).send('Request body must contain an absolute http(s) "url".');
    return;
  }

  let browser;
  try {
    // Launching happens inside the try block so a failed launch is answered
    // with an error response instead of rejecting the handler.
    browser = await puppeteer.launch({
      headless: true,
      args: [
        "--disable-setuid-sandbox",
        "--no-sandbox",
        "--single-process",
        "--no-zygote",
      ],
      executablePath:
        process.env.NODE_ENV === "production"
          ? process.env.PUPPETEER_EXECUTABLE_PATH
          : puppeteer.executablePath(),
    });

    const page = await browser.newPage();

    // A value of 0 is passed for both the navigation timeout and the general
    // default timeout (per the Puppeteer docs this disables them).
    page.setDefaultNavigationTimeout(0);
    page.setDefaultTimeout(0);

    await page.goto(targetUrl.href);

    await scrollToBottom({
      page,
      distancePx: 200,
      speedMs: 50,
      scrollTimeoutMs: 10000,
      eltToScroll: "body"
    });

    const html = await page.content();
    res.send(html);
  } catch (error) {
    console.error(error);
    res.status(500).send(`Something went wrong! Error: ${error}`);
  } finally {
    if (browser) {
      try {
        await browser.close();
      } catch (closeError) {
        // The response has already been sent; just record the failure.
        console.error(closeError);
      }
    }
  }
};

// POST /htmlCode is handled by scrollAndGetPageHTML.
app.post('/htmlCode', scrollAndGetPageHTML);


// Start the server on the port read from ./config and log once it is listening.
const announceListening = () => {
  console.log(`Example app listening on port ${port}!`);
};

app.listen(port, announceListening);
  

我在 Render 上使用 Dockerfile 安装了所有依赖项。

在生产环境中,如何才能把所有图片的 URL 都返回给我的网站?

node.js 网页抓取 puppeteer chromium 无限滚动

评论


答: 暂无答案