Migrate queries to `crawl` dataset
New tables have been introduced in the HTTP Archive dataset, which are more efficient and easier to use. The crawl
dataset contains all the data from the previous pages
, requests
, and other datasets. This guide will help you migrate your queries to the new dataset.
Migrating to crawl.pages
Page data schemas comparison
previously | crawl.pages |
---|---|
date in a table name | date |
client as _TABLE_SUFFIX | client |
url in pages.YYYY_MM_DD_client | page |
not available | is_root_page |
not available | root_page |
not available | rank |
$.testID within payload column in pages.YYYY_MM_DD_client , wptid column in summary_pages.YYYY_MM_DD_client | wptid |
payload in pages.YYYY_MM_DD_client | payload |
req* , resp* and other in summary_pages.YYYY_MM_DD_client | summary |
$.CUSTOM_METRIC_NAME within payload column in pages.YYYY_MM_DD_client | custom_metrics |
report in lighthouse.YYYY_MM_DD_client | lighthouse |
feature , type , id in blink_features.features | feature , type , id in features |
category , app , info in technologies.YYYY_MM_DD_client | categories , technology , info in technologies |
not available | metadata |
Page query updates
- Migrate custom metrics
/* This query will process 115 GB when run. */SELECT client, STRING(custom_metrics.performance.lcp_elem_stats.nodeName) AS lcp_elem_node_name, AVG(INT64(custom_metrics.performance.lcp_elem_stats.size)) AS lcp_elem_node_sizeFROM `httparchive.crawl.pages`WHERE date = '2024-10-01' AND is_root_pageGROUP BY client, lcp_elem_node_nameORDER BY client, lcp_elem_node_size DESC
/* This query will process 6.44 TB when run. */SELECT _TABLE_SUFFIX AS client, JSON_VALUE(JSON_EXTRACT(payload, '$._performance'), '$.lcp_elem_stats.nodeName') AS lcp_elem_node_name, AVG(CAST(JSON_VALUE(JSON_EXTRACT(payload, '$._performance'), '$.lcp_elem_stats.size') AS INT64)) AS lcp_elem_node_sizeFROM `httparchive.pages.2024_10_01_*`GROUP BY client, lcp_elem_node_nameORDER BY client, lcp_elem_node_size DESC
/* This query will process 9 TB when run. */SELECT client, CAST(JSON_VALUE(custom_metrics, '$.performance.lcp_elem_stats.nodeName') AS STRING) AS lcp_elem_node_name, AVG(CAST(JSON_VALUE(custom_metrics, '$.performance.lcp_elem_stats.size') AS INT64)) AS lcp_elem_node_sizeFROM `httparchive.all.pages`WHERE date = '2024-10-01' AND is_root_pageGROUP BY client, lcp_elem_node_nameORDER BY client, lcp_elem_node_size DESC
- Migrate summary metrics queries
/* This query will process 34 GB when run. */SELECT client, INT64(summary.numDomains) AS numDomains, COUNT(0) pages, AVG(INT64(summary.reqTotal)) AS avg_requestsFROM `httparchive.crawl.pages`WHERE date = '2024-10-01' AND is_root_pageGROUP BY client, numDomainsHAVING pages > 1000ORDER BY numDomains ASC
/* This query will process 440 MB when run. */SELECT _TABLE_SUFFIX AS client, numDomains, COUNT(0) pages, AVG(reqTotal) AS avg_requestsFROM `httparchive.summary_pages.2024_06_01_*`GROUP BY client, numDomainsHAVING pages > 1000ORDER BY numDomains ASC
/* This query will process 110 GB when run. */SELECT client, CAST(JSON_VALUE(summary, '$.numDomains') AS INT64) AS numDomains, COUNT(0) pages, AVG(CAST(JSON_VALUE(summary, '$.reqTotal') AS INT64)) AS avg_requestsFROM `httparchive.all.pages`WHERE date = '2024-10-01' AND is_root_pageGROUP BY client, numDomainsHAVING pages > 1000ORDER BY numDomains ASC
- Migrate detected technologies metrics
/* This query will process 7.18 GB when run. */SELECT page, technologies.categories, technologies.technology, technologies.infoFROM `httparchive.crawl.pages`, UNNEST (technologies) AS technologiesWHERE date = '2024-10-01' AND client = 'desktop' AND is_root_page
/* This query will process 14 GB when run. */SELECT url, category, app, infoFROM `httparchive.technologies.2024_10_01_desktop`
/* This query will process 7.18 GB when run. */SELECT page, technologies.categories, technologies.technology, technologies.infoFROM `httparchive.all.pages`, UNNEST (technologies) AS technologiesWHERE date = '2024-10-01' AND client = 'desktop' AND is_root_page
- Migrate lighthouse insights
/* This query will process 4.2 TB when run. */SELECT page, lighthouse.audits.`largest-contentful-paint`.numericValue AS LCPFROM `httparchive.crawl.pages`WHERE date = '2024-10-01' AND client = 'desktop' AND is_root_page
/* This query will process 4.23 TB when run. */SELECT url, JSON_QUERY(report, '$.audits.largest-contentful-paint.numericValue') AS LCPFROM `httparchive.lighthouse.2024_06_01_desktop`
/* This query will process 18 TB when run. */SELECT page, JSON_QUERY(lighthouse, '$.audits.largest-contentful-paint.numericValue') AS LCPFROM `httparchive.all.pages`WHERE date = '2024-10-01' AND client = 'desktop' AND is_root_page
- Migrate Blink features metrics
/* This query will process 114 GB when run. */SELECT page, features.feature, features.type, features.idFROM `httparchive.crawl.pages`,UNNEST (features) AS featuresWHERE date = '2024-10-01' AND client = 'desktop' AND is_root_page
/* This query will process 548 GB when run. */SELECT url, feature, type, idFROM `httparchive.blink_features.features`WHERE yyyymmdd = DATE('2024-05-01') AND client = 'desktop'
/* This query will process 478 GB when run. */SELECT page, features.feature, features.type, features.idFROM `httparchive.all.pages`, UNNEST (features) AS featuresWHERE date = '2024-10-01' AND client = 'desktop' AND is_root_page
Migrating to crawl.requests
Request data schemas comparison
previously | crawl.requests |
---|---|
date in a table name | date |
client as _TABLE_SUFFIX | client |
page in requests.YYYY_MM_DD_client | page |
not available | is_root_page |
not available | root_page |
url in requests.YYYY_MM_DD_client | url |
firstHtml in summary_requests.YYYY_MM_DD_client | is_main_document |
type in summary_requests.YYYY_MM_DD_client | type |
$._index within payload in requests.YYYY_MM_DD_client | index |
payload column in requests.YYYY_MM_DD_client | payload |
req* , resp* and other in summary_requests.YYYY_MM_DD_client | summary |
req_* and reqOtherHeaders in summary_requests.YYYY_MM_DD_client | request_headers |
resp_* and respOtherHeaders in summary_requests.YYYY_MM_DD_client | response_headers |
body in response_bodies.YYYY_MM_DD_client | response_body |
Request query updates
- Migrate headers metrics
/* This query will process 169 GB when run. */SELECT response_header.value AS header_value,FROM `httparchive.crawl.requests`, UNNEST(response_headers) AS response_headerWHERE date = '2024-10-01' AND client = 'desktop' AND is_main_document AND is_root_page AND LOWER(response_header.name) = 'content-type'
/* This query will process 22.5 GB when run. */SELECT resp_content_typeFROM `httparchive.summary_requests.2024_10_01_desktop`, UNNEST(JSON_QUERY_ARRAY(request_headers)) AS request_headersWHERE firstHtml
/* This query will process 10 GB when run. */SELECT response_header.value AS header_value,FROM `httparchive.all.requests`, UNNEST(response_headers) AS response_headerWHERE date = '2024-10-01' AND client = 'desktop' AND is_main_document AND is_root_page AND LOWER(response_header.name) = 'content-type'
- Migrate summary metrics
/* This query will process 376 GB when run. */SELECT page, url, STRING(summary.mimeType) AS mimeType, INT64(summary.respBodySize) AS respBodySize,FROM `httparchive.crawl.requests`WHERE date = '2024-10-01' AND client = 'desktop' AND is_root_page
/* This query will process 193 GB when run. */SELECT page, url, mimeType, respBodySizeFROM `httparchive.summary_requests.2024_10_01_desktop`
/* This query will process 10.8 TB when run. */SELECT page, url, JSON_VALUE(payload, '$.response.content.mimeType') AS mimeType, CAST(JSON_VALUE(payload, '$.response.bodySize') AS INT64) AS respBodySizeFROM `httparchive.requests.2024_10_01_desktop`
/* This query will process 11.19 TB when run. */SELECT page, url, JSON_VALUE(summary, '$.mimeType') AS mimeType, CAST(JSON_VALUE(summary, '$.respBodySize') AS INT64) AS respBodySizeFROM `httparchive.all.requests`WHERE date = '2024-10-01' AND client = 'desktop' AND is_root_page
- Migrate response body queries
/* This query will process 42.8 TB when run. */SELECT page, url, BYTE_LENGTH(response_body) AS bodySizeFROM `httparchive.crawl.requests`WHERE date = '2024-10-01' AND client = 'desktop' AND is_root_page
/* This query will process 40.7 TB when run. */SELECT page, url, BYTE_LENGTH(response_body) AS bodySizeFROM `httparchive.response_bodies.2024_06_01_desktop`