@@ -247,7 +247,7 @@ function checkUrl()
247
247
}
248
248
var lnk = $ ( this ) . attr ( "href" ) . replace ( new RegExp ( "#(.*)" ) , "" ) ;
249
249
250
- if ( ( lnk = check_link ( lnk ) ) == false ) {
250
+ if ( ( lnk = check_link ( lnk , reqUrl ) ) == false ) {
251
251
processingDOM = ( -- links_found > 0 ) ;
252
252
return ;
253
253
}
@@ -292,17 +292,47 @@ function checkUrl()
292
292
} ) ;
293
293
} ;
294
294
295
-
296
- function check_link ( lnk )
297
- {
298
- if ( lnk . indexOf ( "/" ) == 0 )
299
- lnk = "http://" + scrapeHost + lnk ;
300
-
301
- if ( lnk == undefined || [ "#" , "" ] . indexOf ( lnk ) != - 1 || ( lnk . indexOf ( "http://" + scrapeHost ) != 0 && lnk . indexOf ( "https://" + scrapeHost ) != 0 ) ) {
302
- return false ;
303
- }
304
-
305
- return lnk ;
295
/**
 * Check a link scraped from a page to make sure it is one we want to follow,
 * and normalize it to an absolute url.
 *
 * A link is accepted when it is:
 *   - an absolute http/https url whose host equals `scrapeHost`,
 *   - a scheme-less url ("//host/path") whose host equals `scrapeHost`
 *     (returned with "http:" prepended), or
 *   - a relative url (resolved against `parent_link`).
 * Everything else (empty links, "#", mailto:, javascript:, ftp:, any other
 * scheme, foreign hosts, unparseable urls) is rejected.
 *
 * Relies on `url` (Node's url module) and `scrapeHost` being in scope in the
 * enclosing file.
 *
 * @param {String} link        The link scraped from the page
 * @param {String} parent_link The url of the page that "link" was scraped
 *                             from; only used to resolve relative links
 *
 * @return {String|Boolean} The absolute url, or false if the link is rejected
 */
function check_link(link, parent_link) {
    // check for "empty" links (and anything that is not a string at all,
    // which url.parse would otherwise throw on)
    if (typeof link !== 'string' || link === '' || link === '#') {
        return false;
    }

    // parse the link; the third argument (slashesDenoteHost) makes
    // "//host/path" yield a host instead of being treated as a path
    var parts;
    try {
        parts = url.parse(link, false, true);
    } catch (err) {
        // url.parse can throw on malformed input (e.g. undecodable auth
        // section); a link we cannot parse is a link we cannot follow
        return false;
    }

    // NOTE: url.parse reports the protocol lower-cased and *including* the
    // trailing colon, e.g. "http:" rather than "http"
    var protocol = parts.protocol;

    if (protocol === 'http:' || protocol === 'https:') {
        // make sure host is our domain
        return parts.host === scrapeHost ? link : false;
    }

    if (protocol) {
        // incompatible (mailto:, javascript:, ftp:) or unknown protocol
        return false;
    }

    if (link.indexOf('//') === 0) {
        // handle scheme-less links; compare the parsed host exactly so that
        // "//<scrapeHost>.evil.org" is not mistaken for our own host
        if (parts.host !== scrapeHost) {
            return false;
        }
        return 'http:' + link;
    }

    // relative link
    return url.resolve(parent_link, link);
}
307
337
308
338
function make_request ( protocol , host , path , depth , callback )
0 commit comments