Skip to content

Commit adb8894

Browse files
author
Evgheni C.
committed
Merge pull request #8 from NetCrafters/hotfix-invalid-links
Better handling for scraping links, i.e. relative urls and whitespace.
2 parents ed8f8ac + 56f79ba commit adb8894

File tree

1 file changed

+42
-12
lines changed

1 file changed

+42
-12
lines changed

crawling-daemon.js

Lines changed: 42 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -247,7 +247,7 @@ function checkUrl()
247247
}
248248
var lnk = $(this).attr("href").replace(new RegExp("#(.*)"), "");
249249

250-
if ((lnk = check_link(lnk)) == false) {
250+
if ((lnk = check_link(lnk, reqUrl)) == false) {
251251
processingDOM = (--links_found > 0);
252252
return;
253253
}
@@ -292,17 +292,47 @@ function checkUrl()
292292
});
293293
};
294294

295-
296-
function check_link(lnk)
297-
{
298-
if( lnk.indexOf("/")==0 )
299-
lnk = "http://" + scrapeHost + lnk;
300-
301-
if( lnk==undefined || ["#", ""].indexOf(lnk)!=-1 || (lnk.indexOf("http://" + scrapeHost)!=0 && lnk.indexOf("https://"+scrapeHost)!=0) ) {
302-
return false;
303-
}
304-
305-
return lnk;
295+
/**
 * Check a link scraped from a page to make sure it is crawlable, and then
 * normalize it to an absolute url.
 *
 * A link is accepted when it is an http/https url on scrapeHost, a
 * schema-less ("//host/path") url on scrapeHost, or a relative url. Anything
 * else (empty links, bare "#", mailto:, javascript:, ftp:, other schemes,
 * foreign hosts) is rejected.
 *
 * @param {String} link         The link scraped from the page
 * @param {String} parent_link  The link of the page that "link" was scraped from;
 *                              used as the base when resolving relative links
 *
 * @return {String|Boolean} The absolute link, or false when the link must be skipped
 */
function check_link(link, parent_link) {
    // a missing href arrives as undefined; treat every non-string as "no link"
    if (typeof link !== 'string') {
        return false;
    }

    // hrefs frequently carry stray whitespace/newlines around the actual url
    link = link.trim();

    // check for "empty" links
    if (link === '' || link === '#') {
        return false;
    }

    // parse the link; the third argument makes "//host/path" populate "host"
    var parts = url.parse(link, false, true);

    // NOTE: url.parse() reports the protocol lower-cased and WITH the trailing
    // colon (e.g. "http:"), so comparisons must include the colon.
    var protocol = parts.protocol;

    if (protocol === 'http:' || protocol === 'https:') {
        // make sure host is our domain
        if (parts.host !== scrapeHost) {
            return false;
        }
        return link;
    }

    if (protocol) {
        // mailto:, javascript:, ftp: and any other scheme we cannot crawl
        return false;
    }

    if (link.indexOf('//') === 0) {
        // handle schema-less; compare the parsed host exactly so that a host
        // which merely starts with scrapeHost (e.g. "ours.com.evil.com") is rejected
        if (parts.host !== scrapeHost) {
            return false;
        }
        return 'http:' + link;
    }

    // relative link: resolve against the page it was found on
    return url.resolve(parent_link, link);
}
307337

308338
function make_request(protocol, host, path, depth, callback)

0 commit comments

Comments
 (0)