GET_ORIGIN function

The httparchive.fn.GET_ORIGIN function returns the origin for a given URL.

Input

`url`

The URL of a web page.

Type: STRING

Output

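The corresponding origin: the scheme, host, and explicit port (if one is present in the URL), all lowercased. Returns NULL for URLs that do not use the http or https scheme.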

Type: STRING

Example usage

Basic usage

Query
Results

-- Show the origin derived from each sample URL.
WITH sample_urls AS (
  SELECT url
  FROM UNNEST([
    'https://www.example.com/product/123',
    'https://example.com/',
    'http://example.com:80/index.html'
  ]) AS url
)

SELECT
  sample_urls.url,
  `httparchive.fn.GET_ORIGIN`(sample_urls.url) AS origin
FROM sample_urls

url	origin
https://www.example.com/product/123	https://www.example.com
https://example.com/	https://example.com
http://example.com:80/index.html	http://example.com:80

-- Approximate median count of cross-origin requests per page
-- for the 2023-11-01 mobile crawl, restricted to rows flagged is_root_page.
WITH xo_requests_by_page AS (
  -- One row per page that has at least one request whose origin differs from
  -- the page's own origin; pages with no such request produce no row here.
  SELECT COUNT(*) AS resources
  FROM `httparchive.crawl.requests`
  WHERE date = '2023-11-01'
    AND client = 'mobile'
    AND is_root_page
    AND `httparchive.fn.GET_ORIGIN`(url) <> `httparchive.fn.GET_ORIGIN`(page)
  GROUP BY page
)

SELECT
  -- 1000 quantile buckets; the element at offset 500 is the approximate median.
  APPROX_QUANTILES(resources, 1000)[OFFSET(500)] AS median_xo_resources_per_page
FROM xo_requests_by_page

median_xo_resources_per_page
27

Routine

-- Builds the origin of `url`: scheme + host + explicit port (if any), lowercased.
-- Evaluates to NULL when `url` is not http(s), because CONCAT propagates the
-- NULL returned by the scheme extraction.
LOWER(CONCAT(
    -- only network protocols (excludes blob, filesystem, chrome, etc)
    REGEXP_EXTRACT(url, r'(?i)^(https?://)'),
    NET.HOST(url),
    -- be lazy and include @ and : for username/password without enforcing order.
    -- The port must be directly followed by the end of the authority (/, ?, #
    -- or end of string); otherwise a numeric password such as the "123" in
    -- https://user:123@example.com/ would be mistaken for the port.
    IFNULL(REGEXP_EXTRACT(url, r'(?i)^https?://[\w.@:-]+(:\d+)(?:[/?#]|$)'), '')
  ))