diff options
Diffstat (limited to 'lib/UriTools.jsm')
-rw-r--r-- | lib/UriTools.jsm | 405 |
1 files changed, 405 insertions, 0 deletions
/*******************************************************************************

    ηMatrix - a browser extension to black/white list requests.
    Copyright (C) 2014-2019 Raymond Hill
    Copyright (C) 2019 Alessio Vanni

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see {http://www.gnu.org/licenses/}.

    Home: https://gitlab.com/vannilla/ematrix
    uMatrix Home: https://github.com/gorhill/uMatrix
*/

'use strict';

Components.utils.import('chrome://ematrix/content/lib/Punycode.jsm');
Components.utils.import('chrome://ematrix/content/lib/PublicSuffixList.jsm');

var EXPORTED_SYMBOLS = ['UriTools'];

// Generic URI splitter: scheme, authority, path, query, fragment.
var reRFC3986 = /^([^:\/?#]+:)?(\/\/[^\/?#]*)?([^?#]*)(\?[^#]*)?(#.*)?/;
var reSchemeFromURI = /^[^:\/?#]+:/;
var reAuthorityFromURI = /^(?:[^:\/?#]+:)?(\/\/[^\/?#]+)/;
var reOriginFromURI = /^(?:[^:\/?#]+:)?(?:\/\/[^\/?#]+)/;
var reCommonHostnameFromURL = /^https?:\/\/([0-9a-z_][0-9a-z._-]*[0-9a-z])\//;
var rePathFromURI = /^(?:[^:\/?#]+:)?(?:\/\/[^\/?#]*)?([^?#]*)/;
var reMustNormalizeHostname = /[^0-9a-z._-]/;

// These are to parse the authority field, which is not parsed by the
// official regex above.
// IPv6 is seen as an exception: a non-compatible IPv6 is first tried, and
// if it fails, the IPv6 compatible regex is used. This helps
// performance by avoiding the use of a too complicated regex first.

// https://github.com/gorhill/httpswitchboard/issues/211
// "While a hostname may not contain other characters, such as the
// "underscore character (_), other DNS names may contain the underscore"
var reHostPortFromAuthority = /^(?:[^@]*@)?([^:]*)(:\d*)?$/;
var reIPv6PortFromAuthority = /^(?:[^@]*@)?(\[[0-9a-f:]*\])(:\d*)?$/i;

var reHostFromNakedAuthority = /^[0-9a-z._-]+[0-9a-z]$/i;
var reHostFromAuthority = /^(?:[^@]*@)?([^:]+)(?::\d*)?$/;
var reIPv6FromAuthority = /^(?:[^@]*@)?(\[[0-9a-f:]+\])(?::\d*)?$/i;

// Coarse (but fast) tests
var reValidHostname = /^([a-z\d]+(-*[a-z\d]+)*)(\.[a-z\d]+(-*[a-z\d])*)*$/;
var reIPAddressNaive = /^\d+\.\d+\.\d+\.\d+$|^\[[\da-zA-Z:]+\]$/;

var reNetworkScheme = /^(?:https?|wss?|ftps?)\b/;
var reSecureScheme = /^(?:https|wss|ftps)\b/;

/**
 * Clear every parsed component of a URI object.
 *
 * @param {URI} o - The object to clear (modified in place).
 * @returns {URI} The same object, for chaining.
 */
function reset(o) {
    o.scheme = '';
    // Also drop the raw authority so that no stale value from a previous
    // parse survives a reset.
    o.authority = '';
    o.hostname = '';
    o._ipv4 = undefined;
    o._ipv6 = undefined;
    o.port = '';
    o.path = '';
    o.query = '';
    o.fragment = '';
    return o;
}

/**
 * Clear only the components derived from the authority (hostname, port and
 * the IP placeholders); the raw `authority` string itself is left as is.
 *
 * @param {URI} o - The object to clear (modified in place).
 * @returns {URI} The same object, for chaining.
 */
function resetAuthority(o) {
    o.hostname = '';
    o._ipv4 = undefined;
    o._ipv6 = undefined;
    o.port = '';
    return o;
}

/**
 * Container for the components of a parsed URI, plus the bit flags used to
 * select components when reassembling a URI string.
 *
 * @constructor
 */
function URI() {
    this.scheme = '';
    this.authority = '';
    this.hostname = '';
    this._ipv4 = undefined;
    this._ipv6 = undefined;
    this.port = '';
    this.domain = undefined;
    this.path = '';
    this.query = '';
    this.fragment = '';

    // One bit per URI component.
    this.schemeBit = (1 << 0);
    this.userBit = (1 << 1);
    this.passwordBit = (1 << 2);
    this.hostnameBit = (1 << 3);
    this.portBit = (1 << 4);
    this.pathBit = (1 << 5);
    this.queryBit = (1 << 6);
    this.fragmentBit = (1 << 7);
    this.allBits = (0xFFFF);

    // Composite masks.
    this.authorityBit =
        (this.userBit | this.passwordBit | this.hostnameBit | this.portBit);
    this.normalizeBits =
        (this.schemeBit | this.hostnameBit | this.pathBit | this.queryBit);
}

// Single shared instance: every call to UriTools.set() overwrites it.
var cached = new URI();

// hostname -> DomainCacheEntry. Once the map reaches `cacheCountHigh`
// entries it is pruned back down to the `cacheCountLow` most recently
// used ones.
var domainCache = new Map();
var cacheCountLow = 75;
var cacheCountHigh = 100;
// Disposed entries are kept here for reuse instead of being reallocated.
var cacheJunkyard = [];
var junkyardMax = cacheCountHigh - cacheCountLow;

/**
 * A cached hostname-to-domain resolution.
 *
 * @constructor
 * @param {string} domain - The domain resolved for some hostname.
 */
function DomainCacheEntry(domain) {
    this.init(domain);
}

/**
 * (Re)initialize the entry and stamp it with the current time.
 *
 * @param {string} domain - The domain to store.
 * @returns {DomainCacheEntry} This entry.
 */
DomainCacheEntry.prototype.init = function (domain) {
    this.domain = domain;
    this.tstamp = Date.now();
    return this;
};

/**
 * Release the entry; it is kept in the junkyard for reuse when there is
 * room, otherwise it is simply dropped.
 */
DomainCacheEntry.prototype.dispose = function () {
    this.domain = '';
    if (cacheJunkyard.length < junkyardMax) {
        cacheJunkyard.push(this);
    }
};

/**
 * Get a cache entry for `domain`, recycling a disposed one when available.
 *
 * @param {string} domain - The domain to store.
 * @returns {DomainCacheEntry} A ready-to-use entry.
 */
var domainCacheEntryFactory = function (domain) {
    let entry = cacheJunkyard.pop();
    if (entry) {
        return entry.init(domain);
    }
    return new DomainCacheEntry(domain);
};

/**
 * Record the domain of a hostname in the cache. If the hostname is already
 * cached, only its timestamp is refreshed.
 *
 * @param {string} hostname - The cache key.
 * @param {string} domain - The domain resolved for `hostname`.
 * @returns {string} `domain`, unchanged.
 */
var domainCacheAdd = function (hostname, domain) {
    let entry = domainCache.get(hostname);

    if (entry !== undefined) {
        entry.tstamp = Date.now();
    } else {
        domainCache.set(hostname, domainCacheEntryFactory(domain));
        if (domainCache.size === cacheCountHigh) {
            domainCachePrune();
        }
    }

    return domain;
};

// Comparator on cache keys: most recently used first.
var domainCacheSort = function (a, b) {
    return domainCache.get(b).tstamp - domainCache.get(a).tstamp;
};

/**
 * Evict every entry beyond the `cacheCountLow` most recently used ones.
 */
var domainCachePrune = function () {
    let hostnames =
        Array.from(domainCache.keys()).sort(domainCacheSort).slice(cacheCountLow);

    for (let i = hostnames.length - 1; i >= 0; --i) {
        domainCache.get(hostnames[i]).dispose();
        domainCache.delete(hostnames[i]);
    }
};

/**
 * Forget every cached resolution.
 */
var domainCacheReset = function () {
    domainCache.clear();
};

// Cached domains are dropped whenever the public suffix list changes.
publicSuffixList.onChanged.addListener(domainCacheReset);

var UriTools = {
    /**
     * Parse a URI into the shared `cached` object.
     *
     * @param {string} [uri] - The URI to parse. When undefined, the shared
     *     object is reset.
     * @returns {URI} The shared `cached` object; it is overwritten by the
     *     next call to set().
     */
    set: function (uri) {
        if (uri === undefined) {
            return reset(cached);
        }

        let matches = reRFC3986.exec(uri);
        if (!matches) {
            return reset(cached);
        }

        cached.scheme = matches[1] !== undefined ?
            matches[1].slice(0, -1) :
            '';
        cached.authority = matches[2] !== undefined ?
            matches[2].slice(2).toLowerCase() :
            '';
        cached.path = matches[3] !== undefined ?
            matches[3] :
            '';

        // As per RFC3986
        if (cached.authority !== '' && cached.path === '') {
            cached.path = '/';
        }

        cached.query = matches[4] !== undefined ?
            matches[4].slice(1) :
            '';
        cached.fragment = matches[5] !== undefined ?
            matches[5].slice(1) :
            '';

        // Fast path: the authority is nothing but a hostname.
        if (reHostFromNakedAuthority.test(cached.authority)) {
            cached.hostname = cached.authority;
            cached.port = '';
            return cached;
        }

        matches = reHostPortFromAuthority.exec(cached.authority);
        if (!matches) {
            matches = reIPv6PortFromAuthority.exec(cached.authority);
            if (!matches) {
                return resetAuthority(cached);
            }
        }

        cached.hostname = matches[1] !== undefined ?
            matches[1] :
            '';

        // Drop a trailing dot from the hostname.
        if (cached.hostname.slice(-1) === '.') {
            cached.hostname = cached.hostname.slice(0, -1);
        }

        cached.port = matches[2] !== undefined ?
            matches[2].slice(1) :
            '';

        return cached;
    },

    /**
     * Build a URI string from the components of the last parsed URI.
     *
     * Only the scheme, hostname, port, path, query and fragment bits are
     * consulted; empty components are skipped.
     *
     * @param {number} [bits] - Bitwise OR of the `*Bit` flags selecting the
     *     components to include. Defaults to all components.
     * @returns {string} The assembled URI.
     */
    assemble: function (bits) {
        if (bits === undefined) {
            bits = cached.allBits;
        }

        let s = [];

        // Bitwise test: the scheme is included only when its bit is set.
        if (cached.scheme && (bits & cached.schemeBit)) {
            s.push(cached.scheme, ':');
        }
        if (cached.hostname && (bits & cached.hostnameBit)) {
            s.push('//', cached.hostname);
        }
        if (cached.port && (bits & cached.portBit)) {
            s.push(':', cached.port);
        }
        if (cached.path && (bits & cached.pathBit)) {
            s.push(cached.path);
        }
        if (cached.query && (bits & cached.queryBit)) {
            s.push('?', cached.query);
        }
        if (cached.fragment && (bits & cached.fragmentBit)) {
            s.push('#', cached.fragment);
        }

        return s.join('');
    },

    /**
     * @param {string} scheme - A scheme, without the trailing colon.
     * @returns {boolean} Whether the scheme is http(s), ws(s) or ftp(s).
     */
    isNetworkScheme: function (scheme) {
        return reNetworkScheme.test(scheme);
    },

    /**
     * @param {string} scheme - A scheme, without the trailing colon.
     * @returns {boolean} Whether the scheme is https, wss or ftps.
     */
    isSecureScheme: function (scheme) {
        return reSecureScheme.test(scheme);
    },

    /**
     * @param {string} uri - The URI to inspect.
     * @returns {string} The lowercased leading part of the URI up to the end
     *     of the authority, or '' if the URI has no authority.
     */
    originFromURI: function (uri) {
        let matches = reOriginFromURI.exec(uri);
        return matches !== null ? matches[0].toLowerCase() : '';
    },

    /**
     * @param {string} uri - The URI to inspect.
     * @returns {string} The lowercased scheme without the trailing colon,
     *     or '' if there is none.
     */
    schemeFromURI: function (uri) {
        let matches = reSchemeFromURI.exec(uri);
        return matches !== null ? matches[0].slice(0, -1).toLowerCase() : '';
    },

    /**
     * @param {string} uri - The URI to inspect.
     * @returns {string} The lowercased authority, or '' if there is none.
     */
    authorityFromURI: function (uri) {
        let matches = reAuthorityFromURI.exec(uri);
        // The captured group starts with "//": strip both slashes.
        return matches !== null ? matches[1].slice(2).toLowerCase() : '';
    },

    /**
     * Extract the hostname from a URI without touching the shared parsed
     * state.
     *
     * @param {string} uri - The URI to inspect.
     * @returns {string} The hostname, or '' if none could be extracted.
     */
    hostnameFromURI: function (uri) {
        // Fast path for plain http(s) URLs with an already normalized
        // hostname.
        let matches = reCommonHostnameFromURL.exec(uri);
        if (matches) {
            return matches[1];
        }

        matches = reAuthorityFromURI.exec(uri);
        if (!matches) {
            return '';
        }

        let auth = matches[1].slice(2);

        if (reHostFromNakedAuthority.test(auth)) {
            return auth.toLowerCase();
        }

        matches = reHostFromAuthority.exec(auth);
        if (!matches) {
            matches = reIPv6FromAuthority.exec(auth);
            if (!matches) {
                return '';
            }
        }

        let hostname = matches[1];
        while (hostname.endsWith('.')) {
            hostname = hostname.slice(0, -1);
        }

        // Anything outside [0-9a-z._-] requires normalization; keep the
        // normalized result.
        if (reMustNormalizeHostname.test(hostname)) {
            hostname = Punycode.toASCII(hostname.toLowerCase());
        }

        return hostname;
    },

    /**
     * Resolve the domain of a hostname, using the domain cache.
     *
     * A hostname which looks like an IP address is its own domain;
     * otherwise the domain is obtained from the public suffix list.
     *
     * @param {string} hostname - The hostname to resolve.
     * @returns {string} The domain, as cached or as just resolved.
     */
    domainFromHostname: function (hostname) {
        let entry = domainCache.get(hostname);
        if (entry !== undefined) {
            entry.tstamp = Date.now();
            return entry.domain;
        }

        if (reIPAddressNaive.test(hostname) === false) {
            return domainCacheAdd(hostname,
                                  publicSuffixList.getDomain(hostname));
        }

        return domainCacheAdd(hostname, hostname);
    },

    /**
     * @param {string} uri - The URI to inspect.
     * @returns {string} The domain of the URI's hostname, or '' when `uri`
     *     is empty or missing.
     */
    domainFromURI: function (uri) {
        if (!uri) {
            return '';
        }
        return UriTools.domainFromHostname(UriTools.hostnameFromURI(uri));
    },

    /**
     * @returns {string} The domain of the last parsed URI's hostname.
     */
    domain: function () {
        return UriTools.domainFromHostname(cached.hostname);
    },

    /**
     * @param {string} uri - The URI to inspect.
     * @returns {string} The path component of the URI (possibly '').
     */
    pathFromURI: function (uri) {
        let matches = rePathFromURI.exec(uri);
        return matches !== null ? matches[1] : '';
    },

    /**
     * @returns {string} The last parsed URI rebuilt from its scheme,
     *     hostname, path and query only.
     */
    normalizedURI: function () {
        return UriTools.assemble(cached.normalizeBits);
    },

    /**
     * @returns {string} The scheme and hostname of the last parsed URI, or
     *     '' if it has no hostname.
     */
    rootURL: function () {
        if (!cached.hostname) {
            return '';
        }
        return UriTools.assemble(cached.schemeBit | cached.hostnameBit);
    },

    /**
     * Coarse validity check of a hostname.
     *
     * @param {string} hostname - The hostname to check.
     * @returns {boolean} Whether the hostname passes the check; false if
     *     the check itself throws.
     */
    isValidHostname: function (hostname) {
        try {
            return reValidHostname.test(hostname);
        } catch (e) {
            // Best effort: a value which cannot be tested is not valid.
            return false;
        }
    },

    /**
     * Get the immediate parent of a hostname, i.e. the hostname minus its
     * first label.
     *
     * Examples, assuming the domain resolves to "example.org":
     *   "example.org"            => undefined
     *   "www.example.org"        => "example.org"
     *   "tomato.www.example.org" => "www.example.org"
     *
     * @param {string} hostname - The hostname to inspect.
     * @returns {string|undefined} The parent hostname, or undefined when
     *     the domain is '' or is the hostname itself.
     */
    parentHostnameFromHostname: function (hostname) {
        let domain = UriTools.domainFromHostname(hostname);

        if (domain === '' || domain === hostname) {
            return undefined;
        }

        return hostname.slice(hostname.indexOf('.') + 1);
    },

    /**
     * Get every ancestor of a hostname, from its immediate parent up to and
     * including its domain.
     *
     * @param {string} hostname - The hostname to inspect.
     * @returns {string[]} The ancestors, nearest first; empty when the
     *     domain is '' or is the hostname itself.
     */
    parentHostnamesFromHostname: function (hostname) {
        let domain = UriTools.domainFromHostname(hostname);
        if (domain === '' || domain === hostname) {
            return [];
        }

        let nodes = [];
        for (;;) {
            let pos = hostname.indexOf('.');
            if (pos < 0) {
                break;
            }

            hostname = hostname.slice(pos + 1);
            nodes.push(hostname);
            if (hostname === domain) {
                break;
            }
        }

        return nodes;
    },

    /**
     * @param {string} hostname - The hostname to inspect.
     * @returns {string[]} The hostname itself followed by its ancestors.
     */
    allHostNamesFromHostname: function (hostname) {
        let nodes = UriTools.parentHostnamesFromHostname(hostname);
        nodes.unshift(hostname);
        return nodes;
    },

    /**
     * @returns {string} The last parsed URI, fully reassembled.
     */
    toString: function () {
        return UriTools.assemble();
    },
};