提问人:Krystian Owczarek 提问时间:8/3/2023 更新时间:8/3/2023 访问量:48
在 Render 上部署后使用 Puppeteer 在 Node JS 服务器上抓取图像的问题
Issues with scraping images on a Node JS server using Puppeteer after deployment on Render
问:
我有一个 Node.js 服务器:收到 POST 请求后,它用 Puppeteer 打开指定页面,一直向下滚动到底部,然后才返回该页面的 HTML 代码。我的目标是获取页面上所有图像的 URL,因为这些图像是通过无限滚动的方式逐步加载的。在本地,一切都按预期工作,即能返回页面上的所有图像。但是,部署到 Render 之后,同样的请求只返回 3 张图像。原因会不会是免费版服务器可用的 RAM 内存有限(512MB)?
这是我的服务器:
const express = require('express');
const app = express();
const path = require('path')
const { port } = require('./config');
const bodyParser = require('body-parser');
const mime = require('mime')
const cors = require('cors')
const puppeteer = require('puppeteer')
// Enable CORS handling via the `cors` middleware (default options).
app.use(cors())
// Serve static files from the directory this script lives in.
app.use(express.static(path.join(__dirname, './')));
// Response headers applied to every request that reaches this middleware.
// NOTE(review): this is registered after cors() above, so the
// Access-Control-Allow-Origin value set here replaces any value set earlier
// on the same response.
// NOTE(review): Content-Type is forced to application/json for all later
// handlers, including the one that sends page HTML - confirm this is intended.
app.use((req, res, next) => {
res.setHeader('Access-Control-Allow-Origin', 'http://localhost:3000'); // Adjust this value to allow only specific origins
res.setHeader('Access-Control-Allow-Methods', 'POST');
res.setHeader('Access-Control-Allow-Headers', 'Content-Type');
res.setHeader("Content-Type", "application/json");
next();
});
// Turn off Express's ETag header generation.
app.disable('etag');
// Map the 'json' extension to the application/json type in the `mime` library.
// `force: true` is passed so an existing mapping can be overridden
// (behavior depends on the installed `mime` version - confirm).
mime.define({
'application/json': ['json']
}, { force: true })
// Parsers
// Parse request bodies sent with Content-Type: application/json.
app.use(bodyParser.json())
/**
 * Scrolls the page's window downwards in fixed steps until the measured
 * element's height has been covered and has stopped growing, or until a hard
 * timeout expires - whichever comes first.
 *
 * The work runs inside the browser page via `page.evaluate`, so the inner
 * callback may only use its own arguments and page globals.
 *
 * @param {object} options
 * @param {object} options.page - Puppeteer page (anything exposing `evaluate(fn, ...args)`).
 * @param {number} options.distancePx - Pixels scrolled per tick.
 * @param {number} options.speedMs - Delay between ticks, in milliseconds.
 * @param {number} options.scrollTimeoutMs - Hard upper bound for the whole scroll, in milliseconds.
 * @param {string} options.eltToScroll - CSS selector of the element whose `scrollHeight`
 *   is measured. The window itself is what gets scrolled. If the selector matches
 *   nothing, the document's scrolling element is measured instead.
 * @param {number} [options.settleMs=1000] - How long the measured height must stay
 *   unchanged before the scroll is considered finished. This gives lazily loaded
 *   (infinite-scroll) content time to arrive and extend the page. Pass 0 to stop as
 *   soon as the current height has been covered.
 * @returns {Promise<void>} Resolves when scrolling finished or timed out.
 */
function scrollToBottom({
  page,
  distancePx,
  speedMs,
  scrollTimeoutMs,
  eltToScroll,
  settleMs = 1000,
}) {
  return page.evaluate(
    (distancePx, speedMs, scrollTimeoutMs, eltToScroll, settleMs) => {
      return new Promise((resolve) => {
        // Fall back to the document's scroller so a non-matching selector does
        // not leave every tick throwing on a null element.
        const elt =
          (eltToScroll && document.querySelector(eltToScroll)) ||
          document.scrollingElement ||
          document.documentElement;

        // Express the settle window as a number of consecutive ticks during
        // which the measured height must not change.
        const tickMs = speedMs > 0 ? speedMs : 1;
        const requiredStableTicks = settleMs > 0 ? Math.ceil(settleMs / tickMs) : 0;

        let totalHeight = 0;
        let lastHeight = -1;
        let stableTicks = 0;
        let hardStop = null;

        // Single exit point: stop both timers so nothing keeps running in the page.
        const finish = () => {
          clearInterval(timer);
          clearTimeout(hardStop);
          resolve();
        };

        const timer = setInterval(() => {
          const scrollHeight = elt.scrollHeight;
          if (scrollHeight === lastHeight) {
            stableTicks += 1;
          } else {
            // Height changed (first tick, or new content was appended):
            // restart the stability count.
            lastHeight = scrollHeight;
            stableTicks = 0;
          }

          window.scrollBy(0, distancePx);
          totalHeight += distancePx;

          // Done only when the whole height was covered AND the page has
          // stopped growing for the settle window.
          if (totalHeight >= scrollHeight && stableTicks >= requiredStableTicks) {
            finish();
          }
        }, speedMs);

        // Safety net for pages that never stop growing.
        hardStop = setTimeout(finish, scrollTimeoutMs);
      });
    },
    distancePx,
    speedMs,
    scrollTimeoutMs,
    eltToScroll,
    settleMs
  );
}
/**
 * Tells whether `value` is a string holding an absolute http: or https: URL.
 *
 * @param {unknown} value - Candidate taken from the request body.
 * @returns {boolean} True only for parseable http(s) URLs.
 */
function isHttpUrl(value) {
  if (typeof value !== "string") {
    return false;
  }
  try {
    const { protocol } = new URL(value);
    return protocol === "http:" || protocol === "https:";
  } catch (error) {
    // `new URL` throws on unparseable input; that simply means "not a valid URL".
    return false;
  }
}

/**
 * Express handler: opens `req.body.url` in headless Chromium, scrolls the page
 * to the bottom and responds with the resulting page HTML.
 *
 * Responses:
 *  - 400 when `req.body.url` is missing or is not an http(s) URL
 *  - 500 with the error text when launching, navigating or scrolling fails
 *  - otherwise the page HTML as returned by `page.content()`
 *
 * @param {object} req - Express request; expects a parsed JSON body with a `url` string.
 * @param {object} res - Express response.
 * @returns {Promise<void>}
 */
const scrollAndGetPageHTML = async (req, res) => {
  // The URL comes from the client: only let the browser visit http(s) pages,
  // never file:, chrome: or other schemes.
  const targetUrl = req.body ? req.body.url : undefined;
  if (!isHttpUrl(targetUrl)) {
    res
      .status(400)
      .send('Invalid or missing "url": only http(s) URLs are supported.');
    return;
  }

  // Declared outside `try` so `finally` can close it, but launched inside so
  // a launch failure is reported to the client instead of leaving the request
  // hanging on an unhandled rejection.
  let browser;
  try {
    // Set up Chromium browser and page.
    browser = await puppeteer.launch({
      headless: true,
      args: [
        "--disable-setuid-sandbox",
        "--no-sandbox",
        "--single-process",
        "--no-zygote",
      ],
      // In production the Chromium path is taken from the environment;
      // otherwise the one Puppeteer reports is used.
      executablePath:
        process.env.NODE_ENV === "production"
          ? process.env.PUPPETEER_EXECUTABLE_PATH
          : puppeteer.executablePath(),
    });

    const page = await browser.newPage();
    // A value of 0 disables Puppeteer's timeout for navigation functions
    // (goto(), reload(), waitForNavigation(), ...).
    page.setDefaultNavigationTimeout(0);
    // Likewise disables the default timeout for the waiting functions
    // (waitForSelector(), waitForFunction(), ...).
    page.setDefaultTimeout(0);

    // Navigate to the requested page.
    await page.goto(targetUrl);
    await scrollToBottom({
      page,
      distancePx: 200,
      speedMs: 50,
      scrollTimeoutMs: 10000,
      eltToScroll: "body",
    });

    const html = await page.content();
    res.send(html);
  } catch (error) {
    console.error(error);
    // Report the failure with an error status rather than a 200 response.
    res.status(500).send(`Something went wrong! Error: ${error}`);
  } finally {
    if (browser) {
      try {
        await browser.close();
      } catch (closeError) {
        // The response has already been sent; just record the cleanup failure.
        console.error(closeError);
      }
    }
  }
};
// Route: POST /htmlCode runs the scroll-and-scrape handler.
app.post('/htmlCode', scrollAndGetPageHTML);
// Start listening on the configured port and log once the server is up.
const logServerStarted = () => console.log(`Example app listening on port ${port}!`);
app.listen(port, logServerStarted);
我在 Render 上使用 Dockerfile 安装了所有依赖项。
如何在生产环境中把页面上所有图像的 URL 都返回给我的网站?
答: 暂无答案
评论