New tables have been introduced in the HTTP Archive dataset, which are more efficient and easier to use. The all
dataset contains all the data from the previous pages
, requests
, and other datasets. This guide will help you migrate your queries to the new dataset.
Migrating to all.pages
Page data schemas comparison
previously | all.pages |
---|
date in a table name | date |
client as _TABLE_SUFFIX | client |
url in pages.YYYY_MM_DD_client | page |
not available | is_root_page |
not available | root_page |
not available | rank |
$.testID within payload column in pages.YYYY_MM_DD_client , wptid column in summary_pages.YYYY_MM_DD_client | wptid |
payload in pages.YYYY_MM_DD_client | payload |
req* , resp* and other in summary_pages.YYYY_MM_DD_client | summary |
$.CUSTOM_METRIC_NAME within payload column in pages.YYYY_MM_DD_client | custom_metrics |
report in lighthouse.YYYY_MM_DD_client | lighthouse |
feature , type , id in blink_features.features | feature , type , id in features |
category , app , info in technologies.YYYY_MM_DD_client | categories , technology , info in technologies |
not available | metadata |
Page query updates
- Migrate from
blink_features.features
SELECT
url,
feature,
type,
id
FROM `httparchive.blink_features.features`
WHERE yyyymmdd = DATE('2024-05-01')
AND client = 'desktop'
SELECT
page,
features.feature,
features.type,
features.id
FROM `httparchive.all.pages`,
UNNEST (features) AS features
WHERE date = '2024-06-01'
AND client = 'desktop'
AND is_root_page
- Migrate from
lighthouse.YYYY_MM_DD_client
SELECT
url,
JSON_QUERY(report, '$.audits.largest-contentful-paint.numericValue') AS LCP,
FROM `httparchive.lighthouse.2024_06_01_desktop`
/* This query will process 17 TB when run. */
SELECT
page,
JSON_QUERY(lighthouse, '$.audits.largest-contentful-paint.numericValue') AS LCP,
FROM `httparchive.all.pages`
WHERE date = '2024-06-01'
AND client = 'desktop'
AND is_root_page
- Migrate from
pages.YYYY_MM_DD_client
SELECT
url,
_TABLE_SUFFIX AS client,
JSON_QUERY(payload, '$.testID') AS testID,
-- JSON with the results of the custom metrics,
JSON_QUERY(payload, '$._privacy') AS custom_metrics,
FROM `httparchive.pages.2022_06_01_*`
SELECT
page,
client,
wptid,
-- JSON with the results of the custom metrics,
JSON_QUERY(custom_metrics, '$.privacy') AS custom_metrics,
FROM `httparchive.all.pages`
WHERE date = '2022-06-01'
AND is_root_page
- Migrate from
summary_pages.YYYY_MM_DD_client
SELECT
numDomains,
COUNT(0) pages,
ROUND(AVG(reqTotal),2) avg_requests,
FROM `httparchive.summary_pages.2024_06_01_desktop`
GROUP BY
numDomains
HAVING
pages > 1000
ORDER BY
numDomains ASC
/* This query will process 110 GB when run. */
SELECT
CAST(JSON_VALUE(summary, '$.numDomains') AS INT64) AS numDomains,
COUNT(0) pages,
ROUND(AVG(CAST(JSON_VALUE(summary, '$.reqTotal') AS INT64)),2) as avg_requests,
FROM `httparchive.all.pages`
WHERE date = '2024-06-01'
AND client = 'desktop'
AND is_root_page
GROUP BY
numDomains
HAVING
pages > 1000
ORDER BY
numDomains ASC
- Migrate from
technologies.YYYY_MM_DD_client
SELECT
url,
category,
app,
info
FROM `httparchive.technologies.2024_06_01_desktop`
/* This query will process 27 GB when run. */
SELECT
page,
technologies.categories,
technologies.technology,
technologies.info
FROM `httparchive.all.pages`,
UNNEST (technologies) AS technologies
WHERE date = '2024-06-01'
AND client = 'desktop'
AND is_root_page
Migrating to all.requests
Request data schemas comparison
previously | all.requests |
---|
date in a table name | date |
client as _TABLE_SUFFIX | client |
page in requests.YYYY_MM_DD_client | page |
not available | is_root_page |
not available | root_page |
url in requests.YYYY_MM_DD_client | url |
firstHtml in summary_requests.YYYY_MM_DD_client | is_main_document |
type in summary_requests.YYYY_MM_DD_client | type |
$._index within payload in requests.YYYY_MM_DD_client | index |
payload column in requests.YYYY_MM_DD_client | payload |
req* , resp* and other in summary_requests.YYYY_MM_DD_client | summary |
req_* and reqOtherHeaders in almanac.requests | request_headers |
resp_* and respOtherHeaders in almanac.requests | response_headers |
body in response_bodies.YYYY_MM_DD_client | response_body |
Request query updates
- Migrate from
almanac.requests
SELECT
LOWER(JSON_VALUE(request_headers, '$.name')) AS header_name,
JSON_VALUE(request_headers, '$.value') AS header_value,
FROM `httparchive.almanac.requests`,
UNNEST(JSON_QUERY_ARRAY(request_headers)) AS request_headers
WHERE date = '2024-06-01'
AND client = 'desktop'
AND firstHtml
SELECT
LOWER(request_headers.name) AS header_name,
request_headers.value AS header_value,
FROM `httparchive.all.requests`,
UNNEST(request_headers) AS request_headers
WHERE date = '2024-06-01'
AND client = 'desktop'
AND is_main_document
AND is_root_page
- Migrate from
requests.YYYY_MM_DD_client
SELECT
page,
url,
JSON_VALUE(payload, '$.response.content.mimeType') AS mimeType,
CAST(JSON_VALUE(payload, '$.response.bodySize') AS INT64) AS respBodySize,
FROM `httparchive.requests.2024_06_01_desktop`
SELECT
page,
url,
JSON_VALUE(summary, '$.mimeType') AS mimeType,
CAST(JSON_VALUE(summary, '$.respBodySize') AS INT64) AS respBodySize,
FROM `httparchive.all.requests`
WHERE date = '2024-06-01'
AND client = 'desktop'
AND is_root_page
- Migrate from
response_bodies.YYYY_MM_DD_client
SELECT
page,
url,
BYTE_LENGTH(response_body) AS bodySize
FROM `httparchive.response_bodies.2024_06_01_desktop`
/* This query will process 174 TB when run. */
SELECT
page,
url,
BYTE_LENGTH(response_body) AS bodySize
FROM `httparchive.all.requests`
WHERE date = '2024-06-01'
AND client = 'desktop'
AND is_root_page
- Migrate from
summary_requests.YYYY_MM_DD_client
SELECT
ROUND(respBodySize/1024/100)*100 AS responseSize100KB,
COUNT(0) requests,
FROM `httparchive.summary_requests.2024_06_01_desktop`
GROUP BY responseSize100KB
HAVING responseSize100KB > 0
ORDER BY responseSize100KB ASC
/* This query will process 10 TB when run. */
SELECT
ROUND(CAST(JSON_VALUE(summary, '$.respBodySize') AS INT64)/1024/100)*100 AS responseSize100KB,
COUNT(0) requests,
FROM `httparchive.all.requests`
WHERE date = '2024-06-01'
AND client = 'desktop'
AND is_root_page
GROUP BY responseSize100KB
HAVING responseSize100KB > 0
ORDER BY responseSize100KB ASC