Migrate queries to `crawl` dataset
New tables have been introduced in the HTTP Archive dataset, which are more efficient and easier to use. The crawl
dataset contains all the data from the previous pages
, requests
, and other datasets. This guide will help you migrate your queries to the new dataset.
Migrating to crawl.pages
Section titled “Migrating to crawl.pages”Page data schemas comparison
Section titled “Page data schemas comparison”previously | crawl.pages |
---|---|
date in a table name | date |
client as _TABLE_SUFFIX | client |
url in pages.YYYY_MM_DD_client | page |
not available | is_root_page |
not available | root_page |
not available | rank |
$.testID within payload column in pages.YYYY_MM_DD_client , wptid column in summary_pages.YYYY_MM_DD_client | wptid |
payload in pages.YYYY_MM_DD_client | payload |
req* , resp* and other in summary_pages.YYYY_MM_DD_client | summary |
$.CUSTOM_METRIC_NAME within payload column in pages.YYYY_MM_DD_client | custom_metrics |
report in lighthouse.YYYY_MM_DD_client | lighthouse |
feature , type , id in blink_features.features | feature , type , id in features |
category , app , info in technologies.YYYY_MM_DD_client | categories , technology , info in technologies |
not available | metadata |
Page query updates
Section titled “Page query updates”- Migrate custom metrics
/* This query will process 115 GB when run. */SELECT client, STRING(custom_metrics.performance.lcp_elem_stats.nodeName) AS lcp_elem_node_name, AVG(INT64(custom_metrics.performance.lcp_elem_stats.size)) AS lcp_elem_node_sizeFROM `httparchive.crawl.pages`WHERE date = '2024-10-01' AND is_root_pageGROUP BY client, lcp_elem_node_nameORDER BY client, lcp_elem_node_size DESC
/* This query will process 6.44 TB when run. */SELECT _TABLE_SUFFIX AS client, JSON_VALUE(JSON_EXTRACT(payload, '$._performance'), '$.lcp_elem_stats.nodeName') AS lcp_elem_node_name, AVG(CAST(JSON_VALUE(JSON_EXTRACT(payload, '$._performance'), '$.lcp_elem_stats.size') AS INT64)) AS lcp_elem_node_sizeFROM `httparchive.pages.2024_10_01_*`GROUP BY client, lcp_elem_node_nameORDER BY client, lcp_elem_node_size DESC
/* This query will process 9 TB when run. */SELECT client, CAST(JSON_VALUE(custom_metrics, '$.performance.lcp_elem_stats.nodeName') AS STRING) AS lcp_elem_node_name, AVG(CAST(JSON_VALUE(custom_metrics, '$.performance.lcp_elem_stats.size') AS INT64)) AS lcp_elem_node_sizeFROM `httparchive.all.pages`WHERE date = '2024-10-01' AND is_root_pageGROUP BY client, lcp_elem_node_nameORDER BY client, lcp_elem_node_size DESC
- Migrate summary metrics queries
/* This query will process 34 GB when run. */SELECT client, INT64(summary.numDomains) AS numDomains, COUNT(0) pages, AVG(INT64(summary.reqTotal)) AS avg_requestsFROM `httparchive.crawl.pages`WHERE date = '2024-10-01' AND is_root_pageGROUP BY client, numDomainsHAVING pages > 1000ORDER BY numDomains ASC
/* This query will process 440 MB when run. */SELECT _TABLE_SUFFIX AS client, numDomains, COUNT(0) pages, AVG(reqTotal) AS avg_requestsFROM `httparchive.summary_pages.2024_06_01_*`GROUP BY client, numDomainsHAVING pages > 1000ORDER BY numDomains ASC
/* This query will process 110 GB when run. */SELECT client, CAST(JSON_VALUE(summary, '$.numDomains') AS INT64) AS numDomains, COUNT(0) pages, AVG(CAST(JSON_VALUE(summary, '$.reqTotal') AS INT64)) AS avg_requestsFROM `httparchive.all.pages`WHERE date = '2024-10-01' AND is_root_pageGROUP BY client, numDomainsHAVING pages > 1000ORDER BY numDomains ASC
- Migrate detected technologies metrics
/* This query will process 7.18 GB when run. */SELECT page, technologies.categories, technologies.technology, technologies.infoFROM `httparchive.crawl.pages`, UNNEST (technologies) AS technologiesWHERE date = '2024-10-01' AND client = 'desktop' AND is_root_page
/* This query will process 14 GB when run. */SELECT url, category, app, infoFROM `httparchive.technologies.2024_10_01_desktop`
/* This query will process 7.18 GB when run. */SELECT page, technologies.categories, technologies.technology, technologies.infoFROM `httparchive.all.pages`, UNNEST (technologies) AS technologiesWHERE date = '2024-10-01' AND client = 'desktop' AND is_root_page
- Migrate lighthouse insights
/* This query will process 4.2 TB when run. */SELECT page, lighthouse.audits.`largest-contentful-paint`.numericValue AS LCPFROM `httparchive.crawl.pages`WHERE date = '2024-10-01' AND client = 'desktop' AND is_root_page
/* This query will process 4.23 TB when run. */SELECT url, JSON_QUERY(report, '$.audits.largest-contentful-paint.numericValue') AS LCPFROM `httparchive.lighthouse.2024_06_01_desktop`
/* This query will process 18 TB when run. */SELECT page, JSON_QUERY(lighthouse, '$.audits.largest-contentful-paint.numericValue') AS LCPFROM `httparchive.all.pages`WHERE date = '2024-10-01' AND client = 'desktop' AND is_root_page
- Migrate Blink features metrics
/* This query will process 114 GB when run. */SELECT page, features.feature, features.type, features.idFROM `httparchive.crawl.pages`,UNNEST (features) AS featuresWHERE date = '2024-10-01' AND client = 'desktop' AND is_root_page
/* This query will process 548 GB when run. */SELECT url, feature, type, idFROM `httparchive.blink_features.features`WHERE yyyymmdd = DATE('2024-05-01') AND client = 'desktop'
/* This query will process 478 GB when run. */SELECT page, features.feature, features.type, features.idFROM `httparchive.all.pages`, UNNEST (features) AS featuresWHERE date = '2024-10-01' AND client = 'desktop' AND is_root_page
Migrating to crawl.requests
Section titled “Migrating to crawl.requests”Request data schemas comparison
Section titled “Request data schemas comparison”previously | crawl.requests |
---|---|
date in a table name | date |
client as _TABLE_SUFFIX | client |
page in requests.YYYY_MM_DD_client | page |
not available | is_root_page |
not available | root_page |
url in requests.YYYY_MM_DD_client | url |
firstHtml in summary_requests.YYYY_MM_DD_client | is_main_document |
type in summary_requests.YYYY_MM_DD_client | type |
$._index within payload in requests.YYYY_MM_DD_client | index |
payload column in requests.YYYY_MM_DD_client | payload |
req* , resp* and other in summary_requests.YYYY_MM_DD_client | summary |
req_* and reqOtherHeaders in summary_requests.YYYY_MM_DD_client | request_headers |
resp_* and respOtherHeaders in summary_requests.YYYY_MM_DD_client | response_headers |
body in response_bodies.YYYY_MM_DD_client | response_body |
Request query updates
Section titled “Request query updates”- Migrate headers metrics
/* This query will process 169 GB when run. */SELECT response_header.value AS header_value,FROM `httparchive.crawl.requests`, UNNEST(response_headers) AS response_headerWHERE date = '2024-10-01' AND client = 'desktop' AND is_main_document AND is_root_page AND LOWER(response_header.name) = 'content-type'
/* This query will process 22.5 GB when run. */SELECT resp_content_typeFROM `httparchive.summary_requests.2024_10_01_desktop`, UNNEST(JSON_QUERY_ARRAY(request_headers)) AS request_headersWHERE firstHtml
/* This query will process 10 GB when run. */SELECT response_header.value AS header_value,FROM `httparchive.all.requests`, UNNEST(response_headers) AS response_headerWHERE date = '2024-10-01' AND client = 'desktop' AND is_main_document AND is_root_page AND LOWER(response_header.name) = 'content-type'
- Migrate summary metrics
/* This query will process 376 GB when run. */SELECT page, url, STRING(summary.mimeType) AS mimeType, INT64(summary.respBodySize) AS respBodySize,FROM `httparchive.crawl.requests`WHERE date = '2024-10-01' AND client = 'desktop' AND is_root_page
/* This query will process 193 GB when run. */SELECT page, url, mimeType, respBodySizeFROM `httparchive.summary_requests.2024_10_01_desktop`
/* This query will process 10.8 TB when run. */SELECT page, url, JSON_VALUE(payload, '$.response.content.mimeType') AS mimeType, CAST(JSON_VALUE(payload, '$.response.bodySize') AS INT64) AS respBodySizeFROM `httparchive.requests.2024_10_01_desktop`
/* This query will process 11.19 TB when run. */SELECT page, url, JSON_VALUE(summary, '$.mimeType') AS mimeType, CAST(JSON_VALUE(summary, '$.respBodySize') AS INT64) AS respBodySizeFROM `httparchive.all.requests`WHERE date = '2024-10-01' AND client = 'desktop' AND is_root_page
- Migrate response body queries
/* This query will process 42.8 TB when run. */SELECT page, url, BYTE_LENGTH(response_body) AS bodySizeFROM `httparchive.crawl.requests`WHERE date = '2024-10-01' AND client = 'desktop' AND is_root_page
/* This query will process 40.7 TB when run. */SELECT page, url, BYTE_LENGTH(response_body) AS bodySizeFROM `httparchive.response_bodies.2024_06_01_desktop`