Merge pull request #275 from mendableai/feat/issue-273
Added pageOptions.removeTags
commit 6fc1ee32fd
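For context, the new pageOptions.removeTags option lets API callers strip selectors (tag names, .classes and #ids) from scraped output. Below is a minimal usage sketch against the v0 scrape endpoint exercised in the tests; the base URL, port and API key are placeholders for your own deployment, and scrapeWithoutFooterAndNav is a hypothetical helper name:

// Minimal sketch: call POST /v0/scrape with pageOptions.removeTags.
// BASE_URL and API_KEY are placeholders; adjust for your deployment.
const BASE_URL = "http://localhost:3002"; // hypothetical local instance
const API_KEY = process.env.FIRECRAWL_API_KEY ?? "test-key";

async function scrapeWithoutFooterAndNav(url: string) {
  const res = await fetch(`${BASE_URL}/v0/scrape`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${API_KEY}`,
    },
    // removeTags accepts CSS-style selectors: tag names, .classes and #ids.
    body: JSON.stringify({
      url,
      pageOptions: { removeTags: [".nav", "#footer", "strong"] },
    }),
  });
  const json = await res.json();
  return json.data?.content; // scraped content with the matched elements removed
}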
@@ -61,6 +61,13 @@
         "description": "Wait x amount of milliseconds for the page to load to fetch content",
         "default": 0
       },
+      "removeTags": {
+        "type": "array",
+        "items": {
+          "type": "string"
+        },
+        "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
+      },
       "headers": {
         "type": "object",
         "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
@@ -194,6 +201,11 @@
         "type": "integer",
         "description": "Maximum number of pages to crawl",
         "default": 10000
+      },
+      "allowBackwardCrawling": {
+        "type": "boolean",
+        "description": "Allow backward crawling (crawl from the base URL to the previous URLs)",
+        "default": false
       }
     }
   },
@@ -219,6 +231,13 @@
         "type": "object",
         "description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc."
       },
+      "removeTags": {
+        "type": "array",
+        "items": {
+          "type": "string"
+        },
+        "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
+      },
       "replaceAllPathsWithAbsolutePaths": {
         "type": "boolean",
         "description": "Replace all relative paths with absolute paths for images and links",

@@ -135,6 +135,40 @@ describe("E2E Tests for API Routes", () => {
       expect(response.body.data.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
     }, 60000); // 60 seconds
 
+    it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
+      const responseWithoutRemoveTags = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://www.scrapethissite.com/" });
+      expect(responseWithoutRemoveTags.statusCode).toBe(200);
+      expect(responseWithoutRemoveTags.body).toHaveProperty("data");
+      expect(responseWithoutRemoveTags.body.data).toHaveProperty("content");
+      expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown");
+      expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
+      expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
+      expect(responseWithoutRemoveTags.body.data.content).toContain("Scrape This Site");
+      expect(responseWithoutRemoveTags.body.data.content).toContain("Lessons and Videos"); // #footer
+      expect(responseWithoutRemoveTags.body.data.content).toContain("[Sandbox]("); // .nav
+      expect(responseWithoutRemoveTags.body.data.content).toContain("web scraping"); // strong
+
+      const response = await request(TEST_URL)
+        .post("/v0/scrape")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://www.scrapethissite.com/", pageOptions: { removeTags: ['.nav', '#footer', 'strong'] } });
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("data");
+      expect(response.body.data).toHaveProperty("content");
+      expect(response.body.data).toHaveProperty("markdown");
+      expect(response.body.data).toHaveProperty("metadata");
+      expect(response.body.data).not.toHaveProperty("html");
+      expect(response.body.data.content).toContain("Scrape This Site");
+      expect(response.body.data.content).not.toContain("Lessons and Videos"); // #footer
+      expect(response.body.data.content).not.toContain("[Sandbox]("); // .nav
+      expect(response.body.data.content).not.toContain("web scraping"); // strong
+    }, 30000); // 30 seconds timeout
+
     // TODO: add this test back once we nail the waitFor option to be more deterministic
     // it.concurrent("should return a successful response with a valid API key and waitFor option", async () => {
     //   const startTime = Date.now();

@@ -55,8 +55,14 @@ export async function crawlController(req: Request, res: Response) {
     }
 
     const mode = req.body.mode ?? "crawl";
-    const crawlerOptions = req.body.crawlerOptions ?? { allowBackwardCrawling: false };
-    const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
+    const crawlerOptions = req.body.crawlerOptions ?? {
+      allowBackwardCrawling: false
+    };
+    const pageOptions = req.body.pageOptions ?? {
+      onlyMainContent: false,
+      includeHtml: false,
+      removeTags: []
+    };
 
     if (mode === "single_urls" && !url.includes(",")) {
       try {

@@ -26,7 +26,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
 
   const mode = req.body.mode ?? "crawl";
   const crawlerOptions = req.body.crawlerOptions ?? {};
-  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false };
+  const pageOptions = req.body.pageOptions ?? { onlyMainContent: false, includeHtml: false, removeTags: [] };
 
   const job = await addWebScraperJob({
     url: url,

@@ -85,6 +85,7 @@ export async function searchHelper(
       onlyMainContent: pageOptions?.onlyMainContent ?? true,
       fetchPageContent: pageOptions?.fetchPageContent ?? true,
       includeHtml: pageOptions?.includeHtml ?? false,
+      removeTags: pageOptions?.removeTags ?? [],
       fallback: false,
     },
   });

@@ -139,6 +140,7 @@ export async function searchController(req: Request, res: Response) {
       includeHtml: false,
       onlyMainContent: true,
       fetchPageContent: true,
+      removeTags: [],
      fallback: false,
    };
    const origin = req.body.origin ?? "api";

@@ -19,6 +19,7 @@ export type PageOptions = {
   screenshot?: boolean;
   headers?: Record<string, string>;
   replaceAllPathsWithAbsolutePaths?: boolean;
+  removeTags?: string | string[];
 };
 
 export type ExtractorOptions = {
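As the type above shows, removeTags may be supplied either as a single comma separated string (matching the "Use comma separated values" wording in the API spec) or as an array of selectors. A tiny illustration with arbitrary selector values:

// Both forms are accepted by PageOptions.removeTags and are treated equivalently.
const asArray = { onlyMainContent: false, includeHtml: false, removeTags: ["script", ".ad", "#footer"] };
const asString = { onlyMainContent: false, includeHtml: false, removeTags: "script, .ad, #footer" };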

@@ -475,7 +475,12 @@ export class WebScraperDataProvider {
     this.limit = options.crawlerOptions?.limit ?? 10000;
     this.generateImgAltText =
       options.crawlerOptions?.generateImgAltText ?? false;
-    this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false, replaceAllPathsWithAbsolutePaths: false };
+    this.pageOptions = options.pageOptions ?? {
+      onlyMainContent: false,
+      includeHtml: false,
+      replaceAllPathsWithAbsolutePaths: false,
+      removeTags: []
+    };
     this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
     this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false;
     //! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check

@@ -304,6 +304,19 @@ export async function scrapSingleUrl(
   const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
     const soup = cheerio.load(html);
     soup("script, style, iframe, noscript, meta, head").remove();
+
+    if (pageOptions.removeTags) {
+      if (typeof pageOptions.removeTags === 'string') {
+        pageOptions.removeTags.split(',').forEach((tag) => {
+          soup(tag.trim()).remove();
+        });
+      } else if (Array.isArray(pageOptions.removeTags)) {
+        pageOptions.removeTags.forEach((tag) => {
+          soup(tag).remove();
+        });
+      }
+    }
+
     if (pageOptions.onlyMainContent) {
       // remove any other tags that are not in the main content
       excludeNonMainTags.forEach((tag) => {
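For reference, here is a standalone sketch of the removal behavior added above, isolated from the scraper. The helper name stripSelectors and the sample HTML are illustrative, not part of the codebase:

import * as cheerio from "cheerio";

// Mirrors the new removeTags handling: a comma separated string is split into
// individual selectors, an array is applied as-is.
function stripSelectors(html: string, removeTags?: string | string[]): string {
  const soup = cheerio.load(html);
  const selectors =
    typeof removeTags === "string"
      ? removeTags.split(",").map((tag) => tag.trim())
      : removeTags ?? [];
  selectors.forEach((selector) => soup(selector).remove());
  return soup.html();
}

// Example: drop the footer and any <strong> elements.
const cleaned = stripSelectors(
  '<main>Hello <strong>world</strong></main><footer id="footer">Links</footer>',
  "#footer, strong"
);
console.log(cleaned); // <main>Hello </main> (inside cheerio's html/body wrapper)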