Skip to content

GET_ORIGIN function

The httparchive.fn.GET_ORIGIN function returns the origin for a given URL.

Input

url

The URL to extract the origin from, such as the URL of a web page or of a requested resource.

Type: STRING

Output

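The corresponding origin, lowercased: the scheme, the host, and the port when the URL specifies one (for example, https://www.example.com). Returns NULL for URLs that do not use http or https.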

Type: STRING

Example usage

Basic usage

-- Show the origin that GET_ORIGIN derives for a handful of sample URLs.
WITH sample_urls AS (
  SELECT url
  FROM UNNEST([
    'https://www.example.com/product/123',
    'https://example.com/',
    'http://example.com:80/index.html'
  ]) AS url
)
SELECT
  url,
  `httparchive.fn.GET_ORIGIN`(url) AS origin
FROM sample_urls

Counting cross-origin resources per page

-- Median number of cross-origin requests per root page
-- (mobile crawl of 2023-11-01).
WITH cross_origin AS (
  SELECT
    page,
    -- Count conditionally instead of filtering in WHERE: a WHERE filter would
    -- drop pages that have no cross-origin requests at all, which removes the
    -- zeros from the distribution and biases the median upward. COUNTIF only
    -- counts TRUE, so comparisons involving a NULL origin are not counted.
    COUNTIF(
      `httparchive.fn.GET_ORIGIN`(url) != `httparchive.fn.GET_ORIGIN`(page)
    ) AS resources
  FROM `httparchive.crawl.requests`
  WHERE
    date = '2023-11-01' AND
    client = 'mobile' AND
    is_root_page
  GROUP BY page
)
SELECT
  -- 1000 quantile buckets; offset 500 is the 50th percentile (median).
  APPROX_QUANTILES(resources, 1000)[OFFSET(500)] AS median_xo_resources_per_page
FROM
  cross_origin

Routine

-- Builds the lowercased origin of `url`: scheme, host, and the port when one
-- is written in the URL. The result is NULL when `url` is not http(s) or when
-- NET.HOST cannot find a host, because CONCAT returns NULL if any argument is
-- NULL. An explicit default port (e.g. ":80" on http) is kept as written.
LOWER(CONCAT(
  -- only network protocols (excludes blob, filesystem, chrome, etc)
  REGEXP_EXTRACT(url, r'(?i)^(https?://)'),
  NET.HOST(url),
  -- Optional port. The authority is matched loosely: @ and : are allowed so a
  -- username/password prefix is skipped without enforcing order. The port must
  -- be followed by a character that cannot be part of the authority (or by the
  -- end of the string); otherwise a numeric password such as "user:1234@host"
  -- would be mistaken for the port.
  IFNULL(
    REGEXP_EXTRACT(url, r'(?i)^https?://[\w.@:-]+(:\d+)(?:[^\w.@:-]|$)'),
    ''
  )
))