Merge remote-tracking branch 'origin/main' into feat/allowbackwardcrawling-option
This commit is contained in:
commit
dc6acbf1f0
20
.github/workflows/clean-before-24h-complete-jobs.yml
vendored
Normal file
20
.github/workflows/clean-before-24h-complete-jobs.yml
vendored
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
# Scheduled maintenance workflow: once a day, ask the API to purge completed
# queue jobs that finished more than 24 hours ago.
name: Clean Before 24h Completed Jobs

on:
  schedule:
    # Every day at 00:00 UTC.
    - cron: '0 0 * * *'

env:
  # Exposed to the step's shell as $BULL_AUTH_KEY. The script reads the secret
  # from the environment rather than having it expanded inline into the script
  # text by the workflow expression engine.
  BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }}

jobs:
  clean-jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Send GET request to clean jobs
        run: |
          # Only the HTTP status code is captured; the response body is discarded.
          response=$(curl --write-out '%{http_code}' --silent --output /dev/null "https://api.firecrawl.dev/admin/${BULL_AUTH_KEY}/clean-before-24h-complete-jobs")
          # Anything other than 200 fails the job so the run shows up as failed.
          if [ "$response" -ne 200 ]; then
            echo "Failed to clean jobs. Response: $response"
            exit 1
          fi
          echo "Successfully cleaned jobs. Response: $response"
|
@ -218,6 +218,11 @@
|
|||||||
"headers": {
|
"headers": {
|
||||||
"type": "object",
|
"type": "object",
|
||||||
"description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc."
|
"description": "Headers to send with the request when scraping. Can be used to send cookies, user-agent, etc."
|
||||||
|
},
|
||||||
|
"replaceAllPathsWithAbsolutePaths": {
|
||||||
|
"type": "boolean",
|
||||||
|
"description": "Replace all relative paths with absolute paths for images and links",
|
||||||
|
"default": false
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -164,6 +164,27 @@ app.get('/serverHealthCheck/notify', async (req, res) => {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
/**
 * Admin maintenance endpoint: removes completed web-scraper jobs that finished
 * more than 24 hours ago.
 *
 * The route path embeds BULL_AUTH_KEY, so the endpoint is only reachable by
 * callers that know the key.
 *
 * Responds with:
 * - 200 and the number of jobs actually removed (a failure to remove a single
 *   job is logged and skipped, it does not abort the run);
 * - 500 if the completed jobs could not be listed at all.
 */
app.get(`/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`, async (req, res) => {
  const MAX_AGE_MS = 24 * 60 * 60 * 1000;

  try {
    const webScraperQueue = getWebScraperQueue();
    const completedJobs = await webScraperQueue.getJobs(['completed']);

    // Compute the cutoff once so every job is compared against the same instant.
    const cutoff = Date.now() - MAX_AGE_MS;

    // Skip entries without a finish timestamp; also tolerate missing entries
    // (presumably possible if a job disappears while the list is being built —
    // NOTE(review): confirm against the queue library's getJobs contract).
    const before24hJobs = completedJobs.filter(
      (job) => job != null && job.finishedOn != null && job.finishedOn < cutoff
    );

    let count = 0;
    for (const job of before24hJobs) {
      // Convert the id explicitly instead of asserting its type with a cast.
      const jobId = String(job.id);
      try {
        await webScraperQueue.removeJobs(jobId);
        count++;
      } catch (jobError) {
        // Best effort: keep going so one bad job does not block the cleanup.
        console.error(`Failed to remove job with ID ${jobId}:`, jobError);
      }
    }

    res.status(200).send(`Removed ${count} completed jobs.`);
  } catch (error) {
    console.error('Failed to clean completed jobs older than 24h:', error);
    res.status(500).send('Failed to clean jobs');
  }
});
|
||||||
|
|
||||||
app.get("/is-production", (req, res) => {
|
app.get("/is-production", (req, res) => {
|
||||||
res.send({ isProduction: global.isProduction });
|
res.send({ isProduction: global.isProduction });
|
||||||
|
@ -18,6 +18,7 @@ export type PageOptions = {
|
|||||||
waitFor?: number;
|
waitFor?: number;
|
||||||
screenshot?: boolean;
|
screenshot?: boolean;
|
||||||
headers?: Record<string, string>;
|
headers?: Record<string, string>;
|
||||||
|
replaceAllPathsWithAbsolutePaths?: boolean;
|
||||||
};
|
};
|
||||||
|
|
||||||
export type ExtractorOptions = {
|
export type ExtractorOptions = {
|
||||||
|
@ -303,9 +303,10 @@ export class WebScraperDataProvider {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private applyPathReplacements(documents: Document[]): Document[] {
|
private applyPathReplacements(documents: Document[]): Document[] {
|
||||||
return this.replaceAllPathsWithAbsolutePaths
|
if (this.replaceAllPathsWithAbsolutePaths) {
|
||||||
? replacePathsWithAbsolutePaths(documents)
|
documents = replacePathsWithAbsolutePaths(documents);
|
||||||
: replaceImgPathsWithAbsolutePaths(documents);
|
}
|
||||||
|
return replaceImgPathsWithAbsolutePaths(documents);
|
||||||
}
|
}
|
||||||
|
|
||||||
private async applyImgAltText(documents: Document[]): Promise<Document[]> {
|
private async applyImgAltText(documents: Document[]): Promise<Document[]> {
|
||||||
@ -474,9 +475,9 @@ export class WebScraperDataProvider {
|
|||||||
this.limit = options.crawlerOptions?.limit ?? 10000;
|
this.limit = options.crawlerOptions?.limit ?? 10000;
|
||||||
this.generateImgAltText =
|
this.generateImgAltText =
|
||||||
options.crawlerOptions?.generateImgAltText ?? false;
|
options.crawlerOptions?.generateImgAltText ?? false;
|
||||||
this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false };
|
this.pageOptions = options.pageOptions ?? { onlyMainContent: false, includeHtml: false, replaceAllPathsWithAbsolutePaths: false };
|
||||||
this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
|
this.extractorOptions = options.extractorOptions ?? {mode: "markdown"}
|
||||||
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? false;
|
this.replaceAllPathsWithAbsolutePaths = options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false;
|
||||||
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
|
//! @nicolas, for some reason this was being injected and breaking everything. Don't have time to find source of the issue so adding this check
|
||||||
this.excludes = this.excludes.filter((item) => item !== "");
|
this.excludes = this.excludes.filter((item) => item !== "");
|
||||||
this.crawlerMode = options.crawlerOptions?.mode ?? "default";
|
this.crawlerMode = options.crawlerOptions?.mode ?? "default";
|
||||||
|
@ -6,12 +6,14 @@ describe('replacePaths', () => {
|
|||||||
it('should replace relative paths with absolute paths', () => {
|
it('should replace relative paths with absolute paths', () => {
|
||||||
const documents: Document[] = [{
|
const documents: Document[] = [{
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
metadata: { sourceURL: 'https://example.com' },
|
||||||
content: 'This is a [link](/path/to/resource) and an image ![alt text](/path/to/image.jpg).'
|
content: 'This is a [link](/path/to/resource).',
|
||||||
|
markdown: 'This is a [link](/path/to/resource).'
|
||||||
}];
|
}];
|
||||||
|
|
||||||
const expectedDocuments: Document[] = [{
|
const expectedDocuments: Document[] = [{
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
metadata: { sourceURL: 'https://example.com' },
|
||||||
content: 'This is a [link](https://example.com/path/to/resource) and an image ![alt text](https://example.com/path/to/image.jpg).'
|
content: 'This is a [link](https://example.com/path/to/resource).',
|
||||||
|
markdown: 'This is a [link](https://example.com/path/to/resource).'
|
||||||
}];
|
}];
|
||||||
|
|
||||||
const result = replacePathsWithAbsolutePaths(documents);
|
const result = replacePathsWithAbsolutePaths(documents);
|
||||||
@ -21,7 +23,8 @@ describe('replacePaths', () => {
|
|||||||
it('should not alter absolute URLs', () => {
|
it('should not alter absolute URLs', () => {
|
||||||
const documents: Document[] = [{
|
const documents: Document[] = [{
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
metadata: { sourceURL: 'https://example.com' },
|
||||||
content: 'This is an [external link](https://external.com/path) and an image ![alt text](https://example.com/path/to/image.jpg).'
|
content: 'This is an [external link](https://external.com/path).',
|
||||||
|
markdown: 'This is an [external link](https://external.com/path).'
|
||||||
}];
|
}];
|
||||||
|
|
||||||
const result = replacePathsWithAbsolutePaths(documents);
|
const result = replacePathsWithAbsolutePaths(documents);
|
||||||
@ -31,7 +34,8 @@ describe('replacePaths', () => {
|
|||||||
it('should not alter data URLs for images', () => {
|
it('should not alter data URLs for images', () => {
|
||||||
const documents: Document[] = [{
|
const documents: Document[] = [{
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
metadata: { sourceURL: 'https://example.com' },
|
||||||
content: 'This is an image: ![alt text](data:image/png;base64,ABC123==).'
|
content: 'This is an image: ![alt text](data:image/png;base64,ABC123==).',
|
||||||
|
markdown: 'This is an image: ![alt text](data:image/png;base64,ABC123==).'
|
||||||
}];
|
}];
|
||||||
|
|
||||||
const result = replacePathsWithAbsolutePaths(documents);
|
const result = replacePathsWithAbsolutePaths(documents);
|
||||||
@ -41,12 +45,14 @@ describe('replacePaths', () => {
|
|||||||
it('should handle multiple links and images correctly', () => {
|
it('should handle multiple links and images correctly', () => {
|
||||||
const documents: Document[] = [{
|
const documents: Document[] = [{
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
metadata: { sourceURL: 'https://example.com' },
|
||||||
content: 'Here are two links: [link1](/path1) and [link2](/path2), and two images: ![img1](/img1.jpg) ![img2](/img2.jpg).'
|
content: 'Here are two links: [link1](/path1) and [link2](/path2).',
|
||||||
|
markdown: 'Here are two links: [link1](/path1) and [link2](/path2).'
|
||||||
}];
|
}];
|
||||||
|
|
||||||
const expectedDocuments: Document[] = [{
|
const expectedDocuments: Document[] = [{
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
metadata: { sourceURL: 'https://example.com' },
|
||||||
content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2), and two images: ![img1](https://example.com/img1.jpg) ![img2](https://example.com/img2.jpg).'
|
content: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).',
|
||||||
|
markdown: 'Here are two links: [link1](https://example.com/path1) and [link2](https://example.com/path2).'
|
||||||
}];
|
}];
|
||||||
|
|
||||||
const result = replacePathsWithAbsolutePaths(documents);
|
const result = replacePathsWithAbsolutePaths(documents);
|
||||||
@ -56,12 +62,14 @@ describe('replacePaths', () => {
|
|||||||
it('should correctly handle a mix of absolute and relative paths', () => {
|
it('should correctly handle a mix of absolute and relative paths', () => {
|
||||||
const documents: Document[] = [{
|
const documents: Document[] = [{
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
metadata: { sourceURL: 'https://example.com' },
|
||||||
content: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).'
|
content: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).',
|
||||||
|
markdown: 'Mixed paths: [relative](/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).'
|
||||||
}];
|
}];
|
||||||
|
|
||||||
const expectedDocuments: Document[] = [{
|
const expectedDocuments: Document[] = [{
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
metadata: { sourceURL: 'https://example.com' },
|
||||||
content: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).'
|
content: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).',
|
||||||
|
markdown: 'Mixed paths: [relative](https://example.com/path), [absolute](https://example.com/path), and [data image](data:image/png;base64,ABC123==).'
|
||||||
}];
|
}];
|
||||||
|
|
||||||
const result = replacePathsWithAbsolutePaths(documents);
|
const result = replacePathsWithAbsolutePaths(documents);
|
||||||
@ -74,12 +82,14 @@ describe('replacePaths', () => {
|
|||||||
it('should replace relative image paths with absolute paths', () => {
|
it('should replace relative image paths with absolute paths', () => {
|
||||||
const documents: Document[] = [{
|
const documents: Document[] = [{
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
metadata: { sourceURL: 'https://example.com' },
|
||||||
content: 'Here is an image: ![alt text](/path/to/image.jpg).'
|
content: 'Here is an image: ![alt text](/path/to/image.jpg).',
|
||||||
|
markdown: 'Here is an image: ![alt text](/path/to/image.jpg).'
|
||||||
}];
|
}];
|
||||||
|
|
||||||
const expectedDocuments: Document[] = [{
|
const expectedDocuments: Document[] = [{
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
metadata: { sourceURL: 'https://example.com' },
|
||||||
content: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).'
|
content: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).',
|
||||||
|
markdown: 'Here is an image: ![alt text](https://example.com/path/to/image.jpg).'
|
||||||
}];
|
}];
|
||||||
|
|
||||||
const result = replaceImgPathsWithAbsolutePaths(documents);
|
const result = replaceImgPathsWithAbsolutePaths(documents);
|
||||||
@ -89,7 +99,8 @@ describe('replacePaths', () => {
|
|||||||
it('should not alter data:image URLs', () => {
|
it('should not alter data:image URLs', () => {
|
||||||
const documents: Document[] = [{
|
const documents: Document[] = [{
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
metadata: { sourceURL: 'https://example.com' },
|
||||||
content: 'An image with a data URL: ![alt text](data:image/png;base64,ABC123==).'
|
content: 'An image with a data URL: ![alt text](data:image/png;base64,ABC123==).',
|
||||||
|
markdown: 'An image with a data URL: ![alt text](data:image/png;base4,ABC123==).'
|
||||||
}];
|
}];
|
||||||
|
|
||||||
const result = replaceImgPathsWithAbsolutePaths(documents);
|
const result = replaceImgPathsWithAbsolutePaths(documents);
|
||||||
@ -99,12 +110,14 @@ describe('replacePaths', () => {
|
|||||||
it('should handle multiple images with a mix of data and relative URLs', () => {
|
it('should handle multiple images with a mix of data and relative URLs', () => {
|
||||||
const documents: Document[] = [{
|
const documents: Document[] = [{
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
metadata: { sourceURL: 'https://example.com' },
|
||||||
content: 'Multiple images: ![img1](/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](/img3.jpg).'
|
content: 'Multiple images: ![img1](/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](/img3.jpg).',
|
||||||
|
markdown: 'Multiple images: ![img1](/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](/img3.jpg).'
|
||||||
}];
|
}];
|
||||||
|
|
||||||
const expectedDocuments: Document[] = [{
|
const expectedDocuments: Document[] = [{
|
||||||
metadata: { sourceURL: 'https://example.com' },
|
metadata: { sourceURL: 'https://example.com' },
|
||||||
content: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](https://example.com/img3.jpg).'
|
content: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](https://example.com/img3.jpg).',
|
||||||
|
markdown: 'Multiple images: ![img1](https://example.com/img1.jpg) ![img2](data:image/png;base64,ABC123==) ![img3](https://example.com/img3.jpg).'
|
||||||
}];
|
}];
|
||||||
|
|
||||||
const result = replaceImgPathsWithAbsolutePaths(documents);
|
const result = replaceImgPathsWithAbsolutePaths(documents);
|
||||||
|
@ -10,6 +10,7 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[]
|
|||||||
) || [];
|
) || [];
|
||||||
|
|
||||||
paths.forEach((path: string) => {
|
paths.forEach((path: string) => {
|
||||||
|
try {
|
||||||
const isImage = path.startsWith("!");
|
const isImage = path.startsWith("!");
|
||||||
let matchedUrl = path.match(/\(([^)]+)\)/) || path.match(/href="([^"]+)"/);
|
let matchedUrl = path.match(/\(([^)]+)\)/) || path.match(/href="([^"]+)"/);
|
||||||
let url = matchedUrl[1];
|
let url = matchedUrl[1];
|
||||||
@ -22,18 +23,18 @@ export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[]
|
|||||||
}
|
}
|
||||||
|
|
||||||
const markdownLinkOrImageText = path.match(/(!?\[.*?\])/)[0];
|
const markdownLinkOrImageText = path.match(/(!?\[.*?\])/)[0];
|
||||||
if (isImage) {
|
// Image is handled afterwards
|
||||||
document.content = document.content.replace(
|
if (!isImage) {
|
||||||
path,
|
|
||||||
`${markdownLinkOrImageText}(${url})`
|
|
||||||
);
|
|
||||||
} else {
|
|
||||||
document.content = document.content.replace(
|
document.content = document.content.replace(
|
||||||
path,
|
path,
|
||||||
`${markdownLinkOrImageText}(${url})`
|
`${markdownLinkOrImageText}(${url})`
|
||||||
);
|
);
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
document.markdown = document.content;
|
||||||
});
|
});
|
||||||
|
|
||||||
return documents;
|
return documents;
|
||||||
@ -60,8 +61,10 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen
|
|||||||
if (!imageUrl.startsWith("http")) {
|
if (!imageUrl.startsWith("http")) {
|
||||||
if (imageUrl.startsWith("/")) {
|
if (imageUrl.startsWith("/")) {
|
||||||
imageUrl = imageUrl.substring(1);
|
imageUrl = imageUrl.substring(1);
|
||||||
}
|
|
||||||
imageUrl = new URL(imageUrl, baseUrl).toString();
|
imageUrl = new URL(imageUrl, baseUrl).toString();
|
||||||
|
} else {
|
||||||
|
imageUrl = new URL(imageUrl, document.metadata.sourceURL).toString();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -70,6 +73,7 @@ export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Documen
|
|||||||
`![${altText}](${imageUrl})`
|
`![${altText}](${imageUrl})`
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
document.markdown = document.content;
|
||||||
});
|
});
|
||||||
|
|
||||||
return documents;
|
return documents;
|
||||||
|
@ -3,15 +3,17 @@ import { supabase_service } from "./supabase";
|
|||||||
export const callWebhook = async (teamId: string, data: any) => {
|
export const callWebhook = async (teamId: string, data: any) => {
|
||||||
try {
|
try {
|
||||||
const selfHostedUrl = process.env.SELF_HOSTED_WEBHOOK_URL;
|
const selfHostedUrl = process.env.SELF_HOSTED_WEBHOOK_URL;
|
||||||
|
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||||
let webhookUrl = selfHostedUrl;
|
let webhookUrl = selfHostedUrl;
|
||||||
|
|
||||||
if (!selfHostedUrl) {
|
// Only fetch the webhook URL from the database if the self-hosted webhook URL is not set
|
||||||
|
// and the USE_DB_AUTHENTICATION environment variable is set to true
|
||||||
|
if (!selfHostedUrl && useDbAuthentication) {
|
||||||
const { data: webhooksData, error } = await supabase_service
|
const { data: webhooksData, error } = await supabase_service
|
||||||
.from("webhooks")
|
.from("webhooks")
|
||||||
.select("url")
|
.select("url")
|
||||||
.eq("team_id", teamId)
|
.eq("team_id", teamId)
|
||||||
.limit(1);
|
.limit(1);
|
||||||
|
|
||||||
if (error) {
|
if (error) {
|
||||||
console.error(
|
console.error(
|
||||||
`Error fetching webhook URL for team ID: ${teamId}`,
|
`Error fetching webhook URL for team ID: ${teamId}`,
|
||||||
|
Loading…
Reference in New Issue
Block a user