Skip to content

Commit d404130

Browse files
committed
Merge branch 'idc-test' of https://github.com/ImagingDataCommons/IDC-WebApp into idc-prod
2 parents 6b7fe01 + 06232ba commit d404130

File tree

8 files changed

+423
-348
lines changed

8 files changed

+423
-348
lines changed

etl/sql/collection_metadata.sql

Lines changed: 85 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -1,84 +1,90 @@
1-
SELECT
2-
active.idc_webapp_collection_id AS collection_id,
3-
orig_id_map.idc_collection_id AS collection_uuid,
4-
active.collection_name AS name,
5-
NULL AS collections,
6-
ImageTypes AS image_types,
7-
SupportingData AS supporting_data,
8-
Subjects AS subject_count,
9-
DOI AS doi,
10-
URL AS source_url,
11-
CancerType AS cancer_type,
12-
Species AS species,
13-
Location AS location,
14-
NULL AS analysis_artifacts,
15-
Description AS description,
16-
"O" AS collection_type,
17-
REGEXP_REPLACE(active.Program," ","_") AS program,
18-
ARRAY_TO_STRING(active.Access,"; ") AS access,
19-
{etl_date} AS date_updated,
20-
tcia_wiki_collection_id,
21-
"True" AS active
22-
FROM `idc-dev-etl.{dataset}_pub.original_collections_metadata` AS active
23-
JOIN `idc-dev-etl.{dataset}_dev.collection_id_map` AS orig_id_map
24-
ON orig_id_map.idc_webapp_collection_id = active.idc_webapp_collection_id
1+
-- Active original collections: one output row per (collection_id, name) group.
-- The metadata table is cross-joined with its own Sources array, so the
-- per-source rows are collapsed back into single strings with
-- STRING_AGG(DISTINCT ...).
-- NOTE(review): unqualified columns such as source_doi, source_url, Access and
-- license.license_short_name presumably resolve to fields of the unnested
-- Sources elements; confirm against the table schema.
SELECT
2+
active.collection_id AS collection_id,
3+
-- No delimiter argument is given here, so STRING_AGG uses its default separator.
STRING_AGG(DISTINCT(orig_id_map.idc_collection_id)) AS collection_uuid,
4+
active.collection_name AS name,
5+
NULL AS collections,
6+
STRING_AGG(DISTINCT(ImageTypes), ', ') AS image_types,
7+
STRING_AGG(DISTINCT(SupportingData), ', ') AS supporting_data,
8+
-- Largest Subjects value among the rows of the group.
MAX(Subjects) AS subject_count,
9+
-- DOIs and URLs are joined with a single space; the other aggregates use ', '.
STRING_AGG(DISTINCT(source_doi), ' ') AS doi,
10+
STRING_AGG(DISTINCT(source_url), ' ') AS source_url,
11+
STRING_AGG(DISTINCT(CancerTypes), ', ') AS cancer_type,
12+
STRING_AGG(DISTINCT(Species), ', ') AS species,
13+
STRING_AGG(DISTINCT(TumorLocations), ', ') AS location,
14+
NULL AS analysis_artifacts,
15+
STRING_AGG(DISTINCT(descs.Description), ', ') AS description,
16+
-- "O" marks this branch's rows; the analysis-results branch of the union uses "A".
"O" AS collection_type,
17+
-- Spaces in Program are replaced with underscores before aggregation.
STRING_AGG(DISTINCT(REGEXP_REPLACE(active.Program," ","_")), ', ') AS program,
18+
STRING_AGG(DISTINCT(Access), ', ') AS access,
19+
-- Stamped with the date on which the query runs.
CURRENT_DATE() AS date_updated,
20+
-- tcia_wiki_collection_id is populated from collection_name in this branch.
STRING_AGG(DISTINCT(active.collection_name), ', ') AS tcia_wiki_collection_id,
21+
STRING_AGG(DISTINCT(license.license_short_name), ', ') AS license_short_name,
22+
"True" AS active
23+
FROM `idc-dev-etl.{data_version}_pub.original_collections_metadata` AS active,
24+
-- Produces one row per element of Sources; the alias s is never referenced by name.
UNNEST(active.Sources) AS s
25+
JOIN `idc-dev-etl.{data_version}_dev.collection_id_map` AS orig_id_map
26+
ON orig_id_map.idc_webapp_collection_id = active.collection_id
27+
JOIN `idc-dev-etl.{data_version}_dev.original_collections_descriptions` AS descs
28+
ON descs.collection_id = active.collection_id
29+
-- name is the select-list alias of active.collection_name.
GROUP BY collection_id, name
2530
UNION ALL
26-
SELECT
27-
inactive.idc_webapp_collection_id AS collection_id,
28-
orig_id_map.idc_collection_id AS collection_uuid,
29-
CASE
30-
WHEN inactive.tcia_api_collection_id = '' OR inactive.tcia_api_collection_id IS NULL
31-
THEN REPLACE(UPPER(inactive.idc_webapp_collection_id),"_","-")
32-
ELSE inactive.tcia_api_collection_id
33-
END AS name,
34-
NULL AS collections,
35-
ImageTypes AS image_types,
36-
SupportingData AS supporting_data,
37-
Subjects AS subject_count,
38-
DOI AS doi,
39-
URL AS source_url,
40-
CancerType AS cancer_type,
41-
Species AS species,
42-
Location AS location,
43-
NULL AS analysis_artifacts,
44-
Description AS description,
45-
"O" AS collection_type,
46-
REGEXP_REPLACE(inactive.Program," ","_") AS program,
47-
ARRAY_TO_STRING(inactive.Access,"; ") AS access,
48-
{etl_date} AS date_updated,
49-
tcia_wiki_collection_id,
50-
"False" AS active
51-
FROM `idc-dev-etl.idc_v12_dev.excluded_collections_metadata` AS inactive
52-
JOIN `idc-dev-etl.{dataset}_dev.collection_id_map` AS orig_id_map
53-
ON orig_id_map.idc_webapp_collection_id = inactive.idc_webapp_collection_id
31+
-- Excluded (inactive) original collections. The row set is driven by
-- excluded_collections joined to collection_id_map; the descriptive metadata
-- is LEFT JOINed, so every inactive.* column is NULL for a collection that
-- has no metadata row. collection_id and the fallback name are therefore read
-- from orig_id_map, which is always present and equals the inactive value
-- whenever the LEFT JOIN matches.
SELECT
32+
orig_id_map.idc_webapp_collection_id AS collection_id,
33+
orig_id_map.idc_collection_id AS collection_uuid,
34+
-- Use the TCIA API id when one is recorded; otherwise derive a display name
-- from the webapp id (upper-cased, underscores turned into hyphens).
CASE WHEN inactive.tcia_api_collection_id = '' OR inactive.tcia_api_collection_id IS NULL
35+
THEN REPLACE(UPPER(orig_id_map.idc_webapp_collection_id),"_","-")
36+
ELSE inactive.tcia_api_collection_id
37+
END AS name,
38+
NULL AS collections,
39+
ImageTypes AS image_types,
40+
SupportingData AS supporting_data,
41+
Subjects AS subject_count,
42+
DOI AS doi,
43+
URL AS source_url,
44+
CancerType AS cancer_type,
45+
Species AS species,
46+
Location AS location,
47+
NULL AS analysis_artifacts,
48+
Description AS description,
49+
"O" AS collection_type,
50+
REGEXP_REPLACE(inactive.Program," ","_") AS program,
51+
-- Access is flattened with "; " in this branch.
ARRAY_TO_STRING(inactive.Access,"; ") AS access,
52+
CURRENT_DATE() AS date_updated,
53+
tcia_wiki_collection_id,
54+
-- This branch does not supply license information.
NULL AS license_short_name,
55+
"False" AS active
56+
FROM `idc-dev-etl.{data_version}_dev.excluded_collections` AS excluded
57+
JOIN `idc-dev-etl.{data_version}_dev.collection_id_map` AS orig_id_map
58+
ON orig_id_map.idc_collection_id = excluded.idc_collection_id
59+
-- NOTE(review): this table is pinned to idc_v12_dev instead of {data_version};
-- confirm that is intentional.
LEFT JOIN `idc-dev-etl.idc_v12_dev.excluded_collections_metadata` AS inactive
60+
ON inactive.idc_webapp_collection_id = orig_id_map.idc_webapp_collection_id
5461
UNION ALL
55-
SELECT
56-
ID AS collection_id,
57-
analysis_id_map.idc_id AS collection_uuid,
58-
Title AS name,
59-
Collections AS collections,
60-
NULL AS image_types,
61-
NULL AS supporting_data,
62-
Subjects AS subject_count,
63-
DOI AS doi,
64-
NULL AS source_url,
65-
CancerType AS cancer_type,
66-
NULL AS species,
67-
Location AS location,
68-
AnalysisArtifactsonTCIA AS analysis_artifacts,
69-
Description AS description,
70-
"A" AS collection_type,
71-
NULL AS program,
72-
analysis.Access AS access,
73-
{etl_date} AS date_updated,
74-
NULL AS tcia_wiki_collection_id,
75-
CASE
76-
WHEN ID IS NULL
77-
THEN "False"
78-
ELSE "True"
79-
END AS active
80-
FROM `idc-dev-etl.{dataset}_pub.analysis_results_metadata` analysis
81-
JOIN `idc-dev-etl.{dataset}_dev.analysis_id_map` AS analysis_id_map
62+
-- Analysis-results branch: rows of analysis_results_metadata that match both
-- analysis_id_map and analysis_results_descriptions (both are inner joins).
-- Columns that this branch does not supply are emitted as NULL.
SELECT
63+
analysis.ID AS collection_id,
64+
analysis_id_map.idc_id AS collection_uuid,
65+
Title AS name,
66+
Collections AS collections,
67+
NULL AS image_types,
68+
NULL AS supporting_data,
69+
Subjects AS subject_count,
70+
source_doi AS doi,
71+
source_url AS source_url,
72+
CancerTypes AS cancer_type,
73+
NULL AS species,
74+
TumorLocations AS location,
75+
AnalysisArtifacts AS analysis_artifacts,
76+
adescs.Description AS description,
77+
-- "A" marks analysis results; the original-collection branches use "O".
"A" AS collection_type,
78+
NULL AS program,
79+
analysis.Access AS access,
80+
-- Stamped with the date on which the query runs.
CURRENT_DATE() AS date_updated,
81+
NULL AS tcia_wiki_collection_id,
82+
license_short_name,
83+
-- NOTE(review): analysis.ID is also the key of both inner joins below, so the
-- IS NULL case looks unreachable and this would always yield "True"; confirm.
CASE WHEN analysis.ID IS NULL THEN "False" ELSE "True" END AS active
84+
FROM `idc-dev-etl.{data_version}_pub.analysis_results_metadata` analysis
85+
JOIN `idc-dev-etl.{data_version}_dev.analysis_id_map` AS analysis_id_map
8286
ON analysis_id_map.collection_id = analysis.ID
87+
JOIN `idc-dev-etl.{data_version}_dev.analysis_results_descriptions` AS adescs
88+
ON adescs.id = analysis.ID
8389
-- Placed after the last UNION ALL branch, so it orders the combined result.
ORDER BY collection_id
8490
;

etl/sql/data_summary.sql

Lines changed: 30 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,43 @@
1-
SELECT
2-
volume.version,
3-
volume.sum AS data_volume,
4-
series.count AS series_count,
5-
patient.count AS patient_count,
6-
collex.count AS collection_count
7-
FROM (
8-
SELECT "{version_number}" AS version, (SUM(instance_size))/(1024*1024*1024*1024) AS sum
9-
FROM `idc-dev-etl.{dataset}_pub.dicom_all`
10-
) volume
11-
JOIN (
12-
SELECT "v15" AS version, COUNT(*) AS count
1+
-- Each CTE below tags its aggregate with the same "{version_display}" label
-- so that the final SELECT can join the CTEs on version.
-- volume: SUM(instance_size) divided by 1000^4 (the removed version of this
-- line divided by 1024^4).
-- NOTE(review): presumably instance_size is in bytes, which would make this
-- terabytes; confirm.
WITH volume AS (
2+
SELECT "{version_display}" AS version, (SUM(instance_size))/(pow(1000,4)) AS sum
3+
FROM `idc-dev-etl.{data_version}_pub.dicom_all`
4+
),
5+
-- series: number of distinct SeriesInstanceUID values in dicom_all.
series AS (
6+
SELECT "{version_display}" AS version, COUNT(*) AS count
137
FROM (
148
SELECT DISTINCT SeriesInstanceUID
15-
FROM `idc-dev-etl.{dataset}_pub.dicom_all`
9+
FROM `idc-dev-etl.{data_version}_pub.dicom_all`
1610
)
1711
GROUP BY version
18-
) series ON
19-
series.version = volume.version
20-
JOIN (
21-
SELECT "{version_number}" AS version, COUNT(*) AS count
12+
),
13+
-- patient: number of distinct PatientID values in dicom_all.
patient AS (
14+
SELECT "{version_display}" AS version, COUNT(*) AS count
2215
FROM (
2316
SELECT DISTINCT PatientID
24-
FROM `idc-dev-etl.{dataset}_pub.dicom_all`
17+
FROM `idc-dev-etl.{data_version}_pub.dicom_all`
2518
)
2619
GROUP BY version
27-
) patient ON
28-
series.version = patient.version
29-
JOIN (
30-
SELECT "{version_number}" AS version, COUNT(*) AS count
20+
),
21+
-- collex: number of distinct non-NULL collection_id values among rows that
-- have no analysis_results_id. It reads dicom_derived_all from the _dev
-- dataset, whereas the other CTEs read dicom_all from _pub.
collex AS (
22+
SELECT "{version_display}" AS version, COUNT(*) AS count
3123
FROM (
3224
SELECT DISTINCT collection_id
33-
FROM `idc-dev-etl.{dataset}_pub.dicom_derived_all`
25+
FROM `idc-dev-etl.{data_version}_dev.dicom_derived_all`
3426
WHERE collection_id IS NOT NULL AND analysis_results_id IS NULL
3527
)
3628
GROUP BY version
37-
) collex ON
38-
series.version = collex.version
39-
;
29+
)
30+
-- Assemble the summary row from the volume, series, patient and collex CTEs.
-- Each CTE is joined back to volume on its own version column.
SELECT
31+
volume.version,
32+
volume.sum AS data_volume,
33+
series.count AS series_count,
34+
patient.count AS patient_count,
35+
collex.count AS collection_count
36+
FROM volume
37+
JOIN series
38+
ON series.version = volume.version
39+
JOIN patient
40+
-- The condition must reference patient; one written only over series and
-- volume would leave this join unconstrained.
ON patient.version = volume.version
41+
JOIN collex
42+
ON volume.version = collex.version
43+
;

etl/sql/dicom_derived_all.sql

Lines changed: 45 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -17,51 +17,68 @@ SELECT
1717
dicom.crdc_study_uuid AS crdc_study_uuid,
1818
dicom.crdc_series_uuid AS crdc_series_uuid,
1919
dicom.crdc_instance_uuid AS crdc_instance_uuid,
20-
dicom.tcia_tumorLocation,
20+
dicom.collection_tumorLocation AS tcia_tumorLocation,
2121
dicom.collection_cancerType AS CancerType,
2222
dicom.source_DOI,
23-
dicom.tcia_species,
23+
dicom.collection_species AS tcia_species,
2424
dicom.gcs_url,
2525
dicom.aws_url,
2626
dicom.Manufacturer,
2727
dicom.ManufacturerModelName,
28+
dicom.gcs_bucket,
29+
dicom.aws_bucket,
2830
collex.Program AS program,
29-
REPLACE(collection_id,"-","_") AS collection_id,
31+
REPLACE(dicom.collection_id,"-","_") AS collection_id,
3032
analysis.ID AS analysis_results_id,
3133
curated_series.illuminationType_code_designator_value_str AS illuminationType,
3234
curated_series.primaryAnatomicStructure_code_designator_value_str AS primaryAnatomicStructure,
3335
curated_series.ObjectiveLensPower,
3436
curated_series.min_PixelSpacing_2sf AS min_PixelSpacing,
3537
curated_series.max_TotalPixelMatrixColumns,
3638
curated_series.max_TotalPixelMatrixRows,
37-
Internal_structure, Sphericity, Calcification, Lobular_Pattern, Spiculation, Margin,
38-
Texture, Subtlety_score, Malignancy, Volume, Diameter, Surface_area_of_mesh,
39-
Apparent_Diffusion_Coefficient, segs.AnatomicRegionSequence, SegmentedPropertyCategoryCodeSequence,
40-
SegmentedPropertyTypeCodeSequence, segs.FrameOfReferenceUID,
41-
SegmentNumber, SegmentAlgorithmType, Sphericity_quant, Volume_of_Mesh,
42-
CASE WHEN collection_id LIKE 'tcga_%' THEN "True" ELSE "False" END AS has_related,
43-
CASE WHEN qual.SOPInstanceUID IS NULL THEN 'False' ELSE 'True' END AS has_qualitative,
44-
CASE WHEN quan.SOPInstanceUID IS NULL THEN 'False' ELSE 'True' END AS has_quantitative,
45-
CASE WHEN segs.AnatomicRegionSequence IS NULL AND SegmentedPropertyCategoryCodeSequence IS NULL
46-
AND SegmentedPropertyTypeCodeSequence IS NULL THEN 'False' else 'True' END AS has_segmentation,
47-
CASE WHEN qual.SOPInstanceUID IS NULL AND quan.SOPInstanceUID IS NULL
48-
AND segs.SOPInstanceUID IS NULL THEN 'False' else 'True' END AS has_derived,
49-
dicom.access AS access,
50-
REGEXP_EXTRACT(gcs_url, r'^[a-zA-Z0-9-_]+://([a-zA-Z0-9-_]+)/') gcs_bucket,
51-
REGEXP_EXTRACT(aws_url, r'^[a-zA-Z0-9-_]+://([a-zA-Z0-9-_]+)/') aws_bucket
52-
FROM `idc-pdp-staging.{dataset}.dicom_all` dicom
53-
LEFT JOIN `idc-dev-etl.{dataset}_dev.qualitative_pivot` qual
39+
Internal_structure,
40+
Sphericity,
41+
Calcification,
42+
Lobular_Pattern,
43+
Spiculation,
44+
Margin,
45+
Texture,
46+
Subtlety_score,
47+
Malignancy,
48+
Volume,
49+
Diameter,
50+
Surface_area_of_mesh,
51+
Apparent_Diffusion_Coefficient,
52+
segs.AnatomicRegionSequence,
53+
SegmentedPropertyCategoryCodeSequence,
54+
SegmentedPropertyTypeCodeSequence,
55+
segs.FrameOfReferenceUID,
56+
SegmentNumber,
57+
SegmentAlgorithmType,
58+
SegmentAlgorithmName,
59+
Sphericity_quant,
60+
Volume_of_Mesh,
61+
CASE WHEN dicom.collection_id LIKE 'tcga_%' THEN "True" ELSE "False" END AS has_related,
62+
CASE WHEN qual.SOPInstanceUID IS NULL THEN 'False' ELSE 'True' END AS has_qualitative,
63+
CASE WHEN quan.SOPInstanceUID IS NULL THEN 'False' ELSE 'True' END AS has_quantitative,
64+
CASE WHEN segs.AnatomicRegionSequence IS NULL AND SegmentedPropertyCategoryCodeSequence IS NULL
65+
AND SegmentedPropertyTypeCodeSequence IS NULL THEN 'False' else 'True' END AS has_segmentation,
66+
CASE WHEN qual.SOPInstanceUID IS NULL AND quan.SOPInstanceUID IS NULL
67+
AND segs.SOPInstanceUID IS NULL THEN 'False' else 'True' END AS has_derived,
68+
dicom.access AS access
69+
FROM `idc-pdp-staging.{data_version}.dicom_all` dicom
70+
LEFT JOIN `idc-dev-etl.{data_version}_dev.qualitative_pivot` qual
5471
ON qual.SOPInstanceUID = dicom.SOPInstanceUID
55-
LEFT JOIN `idc-dev-etl.{dataset}_dev.quantitative_pivot` quan
72+
LEFT JOIN `idc-dev-etl.{data_version}_dev.quantitative_pivot` quan
5673
ON quan.SOPInstanceUID = dicom.SOPInstanceUID
57-
LEFT JOIN `idc-dev-etl.{dataset}_dev.segmentations_pivot` segs
74+
LEFT JOIN `idc-dev-etl.{data_version}_dev.segmentations_pivot` segs
5875
ON segs.SOPInstanceUID = dicom.SOPInstanceUID
59-
LEFT JOIN `idc-pdp-staging.{dataset}.original_collections_metadata` collex
60-
ON collex.idc_webapp_collection_id = dicom.collection_id
61-
LEFT JOIN `idc-pdp-staging.{dataset}.analysis_results_metadata` analysis
62-
ON LOWER(analysis.DOI) = LOWER(dicom.source_DOI)
63-
LEFT JOIN `idc-pdp-staging.{dataset}.dicom_metadata_curated` curated
76+
LEFT JOIN `idc-pdp-staging.{data_version}.original_collections_metadata` collex
77+
ON collex.collection_id = dicom.collection_id
78+
LEFT JOIN `idc-pdp-staging.{data_version}.analysis_results_metadata` analysis
79+
ON LOWER(analysis.source_doi) = LOWER(dicom.source_doi)
80+
LEFT JOIN `idc-pdp-staging.{data_version}.dicom_metadata_curated` curated
6481
ON curated.SOPInstanceUID = dicom.SOPInstanceUID
65-
LEFT JOIN `idc-pdp-staging.{dataset}.dicom_metadata_curated_series_level_view` curated_series
82+
LEFT JOIN `idc-pdp-staging.{data_version}.dicom_metadata_curated_series_level_view` curated_series
6683
ON curated_series.SeriesInstanceUID = dicom.SeriesInstanceUID
6784
;

0 commit comments

Comments
 (0)