Skip to content

Commit 1a9c6bf

Browse files
committed
Implement URL parsing/unparsing per RFC 3986.
- Section 5.3 (Component Recomposition) of RFC 3986 distinguishes between undefined components and empty components, a distinction that PHP's built-in parse_url does not make. This patch deals with that issue and ensures, for instance, that empty queries and fragments are detected.
1 parent fed4091 commit 1a9c6bf

File tree

1 file changed

+81
-76
lines changed

1 file changed

+81
-76
lines changed

jsonld.php

Lines changed: 81 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -506,59 +506,45 @@ function jsonld_parse_url($url) {
506506
$url = '';
507507
}
508508

509-
$rval = parse_url($url);
509+
$keys = array(
510+
'href', 'protocol', 'scheme', '?authority', 'authority',
511+
'?auth', 'auth', 'user', 'pass', 'host', '?port', 'port', 'path',
512+
'?query', 'query', '?fragment', 'fragment');
513+
$regex = "/^(([^:\/?#]+):)?(\/\/(((([^:@]*)(?::([^:@]*))?)?@)?([^:\/?#]*)(:(\d*))?))?([^?#]*)(\?([^#]*))?(#(.*))?/";
514+
preg_match($regex, $url, $match);
510515

511-
// malformed url
512-
if($rval === false) {
513-
$rval = array();
516+
$rval = array();
517+
$flags = array();
518+
$len = count($keys);
519+
for($i = 0; $i < $len; ++$i) {
520+
$key = $keys[$i];
521+
if(strpos($key, '?') === 0) {
522+
$flags[substr($key, 1)] = !empty($match[$i]);
523+
} else if(!isset($match[$i])) {
524+
$rval[$key] = null;
525+
} else {
526+
$rval[$key] = $match[$i];
527+
}
514528
}
515529

516-
$rval['href'] = $url;
517-
if(!isset($rval['scheme'])) {
518-
$rval['scheme'] = '';
519-
$rval['protocol'] = '';
520-
} else {
521-
$rval['protocol'] = $rval['scheme'] . ':';
530+
if(!$flags['authority']) {
531+
$rval['authority'] = null;
522532
}
523-
if(!isset($rval['host'])) {
524-
$rval['host'] = '';
533+
if(!$flags['auth']) {
534+
$rval['auth'] = $rval['user'] = $rval['pass'] = null;
525535
}
526-
if(!isset($rval['path'])) {
527-
$rval['path'] = '';
536+
if(!$flags['port']) {
537+
$rval['port'] = null;
528538
}
529-
if(isset($rval['user']) || isset($rval['pass'])) {
530-
$rval['auth'] = '';
531-
if(isset($rval['user'])) {
532-
$rval['auth'] = $rval['user'];
533-
}
534-
if(isset($rval['pass'])) {
535-
$rval['auth'] .= ":{$rval['pass']}";
536-
}
539+
if(!$flags['query']) {
540+
$rval['query'] = null;
537541
}
538-
// parse authority for unparsed relative network-path reference
539-
if(strpos($rval['href'], ':') === false &&
540-
strpos($rval['href'], '//') === 0 && $rval['host'] === '') {
541-
// must parse authority from pathname
542-
$rval['path'] = substr($rval['path'], 2);
543-
$idx = strpos($rval['path'], '/');
544-
if($idx === false) {
545-
$rval['authority'] = $rval['path'];
546-
$rval['path'] = '';
547-
} else {
548-
$rval['authority'] = substr($rval['path'], 0, $idx);
549-
$rval['path'] = substr($rval['path'], $idx);
550-
}
551-
} else {
552-
$rval['authority'] = $rval['host'];
553-
if(isset($rval['port'])) {
554-
$rval['authority'] .= ":{$rval['port']}";
555-
}
556-
if(isset($rval['auth'])) {
557-
$rval['authority'] = "{$rval['auth']}@{$rval['authority']}";
558-
}
542+
if(!$flags['fragment']) {
543+
$rval['fragment'] = null;
559544
}
545+
560546
$rval['normalizedPath'] = jsonld_remove_dot_segments(
561-
$rval['path'], $rval['authority'] !== '');
547+
$rval['path'], !!$rval['authority']);
562548

563549
return $rval;
564550
}
@@ -628,47 +614,66 @@ function jsonld_prepend_base($base, $iri) {
628614
// parse given IRI
629615
$rel = jsonld_parse_url($iri);
630616

631-
// start hierarchical part
632-
$hierPart = $base['protocol'];
633-
if($rel['authority']) {
634-
$hierPart .= "//{$rel['authority']}";
635-
} else if($base['href'] !== '') {
636-
$hierPart .= "//{$base['authority']}";
637-
}
638-
639-
// per RFC3986 normalize
617+
// per RFC3986 5.2.2
618+
$transform = array('protocol' => $base['protocol']);
640619

641-
// IRI represents an absolute path
642-
if(strpos($rel['path'], '/') === 0) {
643-
$path = $rel['path'];
620+
if($rel['authority'] !== null) {
621+
$transform['authority'] = $rel['authority'];
622+
$transform['path'] = $rel['path'];
623+
$transform['query'] = $rel['query'];
644624
} else {
645-
$path = $base['path'];
625+
$transform['authority'] = $base['authority'];
626+
627+
if($rel['path'] === '') {
628+
$transform['path'] = $base['path'];
629+
if($rel['query'] !== null) {
630+
$transform['query'] = $rel['query'];
631+
} else {
632+
$transform['query'] = $base['query'];
633+
}
634+
} else {
635+
if(strpos($rel['path'], '/') === 0) {
636+
// IRI represents an absolute path
637+
$transform['path'] = $rel['path'];
638+
} else {
639+
// merge paths
640+
$path = $base['path'];
641+
642+
// append relative path to the end of the last directory from base
643+
if($rel['path'] !== '') {
644+
$idx = strrpos($path, '/');
645+
$idx = ($idx === false) ? 0 : $idx + 1;
646+
$path = substr($path, 0, $idx);
647+
if(strlen($path) > 0 && substr($path, -1) !== '/') {
648+
$path .= '/';
649+
}
650+
$path .= $rel['path'];
651+
}
646652

647-
// append relative path to the end of the last directory from base
648-
if($rel['path'] !== '') {
649-
$idx = strrpos($path, '/');
650-
$idx = ($idx === false) ? 0 : $idx + 1;
651-
$path = substr($path, 0, $idx);
652-
if(strlen($path) > 0 && substr($path, -1) !== '/') {
653-
$path .= '/';
653+
$transform['path'] = $path;
654654
}
655-
$path .= $rel['path'];
655+
$transform['query'] = $rel['query'];
656656
}
657657
}
658658

659659
// remove slashes and dots in path
660-
$path = jsonld_remove_dot_segments($path, $hierPart !== '');
660+
$transform['path'] = jsonld_remove_dot_segments(
661+
$transform['path'], !!$transform['authority']);
661662

662-
// add query and hash
663-
if(isset($rel['query'])) {
664-
$path .= "?{$rel['query']}";
663+
// construct URL
664+
$rval = $transform['protocol'];
665+
if($transform['authority'] !== null) {
666+
$rval .= '//' . $transform['authority'];
665667
}
666-
if(isset($rel['fragment'])) {
667-
$path .= "#{$rel['fragment']}";
668+
$rval .= $transform['path'];
669+
if($transform['query'] !== null) {
670+
$rval .= '?' . $transform['query'];
671+
}
672+
if($rel['fragment'] !== null) {
673+
$rval .= '#' . $rel['fragment'];
668674
}
669675

670-
$rval = $hierPart . $path;
671-
676+
// handle empty base
672677
if($rval === '') {
673678
$rval = './';
674679
}
@@ -716,7 +721,7 @@ function jsonld_remove_base($base, $iri) {
716721
// is a hash or query)
717722
$base_segments = explode('/', $base['normalizedPath']);
718723
$iri_segments = explode('/', $rel['normalizedPath']);
719-
$last = (isset($rel['query']) || isset($rel['fragment'])) ? 0 : 1;
724+
$last = ($rel['query'] || $rel['fragment']) ? 0 : 1;
720725
while(count($base_segments) > 0 && count($iri_segments) > $last) {
721726
if($base_segments[0] !== $iri_segments[0]) {
722727
break;
@@ -740,10 +745,10 @@ function jsonld_remove_base($base, $iri) {
740745
$rval .= implode('/', $iri_segments);
741746

742747
// add query and hash
743-
if(isset($rel['query'])) {
748+
if($rel['query'] !== null) {
744749
$rval .= "?{$rel['query']}";
745750
}
746-
if(isset($rel['fragment'])) {
751+
if($rel['fragment'] !== null) {
747752
$rval .= "#{$rel['fragment']}";
748753
}
749754

0 commit comments

Comments
 (0)