From 49d38c4d503734f041cdb0036e6c5f3176b45264 Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Wed, 20 Sep 2023 15:31:51 +0200 Subject: [PATCH 1/5] add stackexchange code --- data_analysis/stackoverflow/h4_code/README.md | 26 + .../stackoverflow/h4_code/binarize.py | 117 +++ .../h4_code/stack_exchange_explore.py | 305 ++++++++ .../h4_code/stack_exchange_process.py | 718 ++++++++++++++++++ data_analysis/stackoverflow/other/main.py | 195 +++++ .../stackoverflow/other/requirements.txt | 5 + .../stackoverflow/other/se_reference_utils.py | 347 +++++++++ 7 files changed, 1713 insertions(+) create mode 100644 data_analysis/stackoverflow/h4_code/README.md create mode 100644 data_analysis/stackoverflow/h4_code/binarize.py create mode 100644 data_analysis/stackoverflow/h4_code/stack_exchange_explore.py create mode 100644 data_analysis/stackoverflow/h4_code/stack_exchange_process.py create mode 100644 data_analysis/stackoverflow/other/main.py create mode 100644 data_analysis/stackoverflow/other/requirements.txt create mode 100644 data_analysis/stackoverflow/other/se_reference_utils.py diff --git a/data_analysis/stackoverflow/h4_code/README.md b/data_analysis/stackoverflow/h4_code/README.md new file mode 100644 index 0000000..e918771 --- /dev/null +++ b/data_analysis/stackoverflow/h4_code/README.md @@ -0,0 +1,26 @@ +# Scripts for preference model pretraining data + +### Stack Exchange +Note: Stack Exchange Data Dump has a license requiring the addition of author's and links to the original material, see more [here](https://archive.org/details/stackexchange). + +1) `stack_exchange_explore.py`: example script for filtering stack exchange data to the question & answer format in Askell et al. 2021 on preference model pretraining (PMP). + +To run this code (from scratch including data download and faster processing), do the following: +Identify the raw data directory you're hoping to process, `ex_data_url`, and related data variables (further string optimizations can be added). 
+The script will pull raw data if you need it, uncompress it, and process the file to text. + +```shell +python scripts/data/pmp/stack_exchange_explore.py --stack_exchange=pets --save=True +``` + +2) `stack_exchange_process.py`: same as above, but designed to be run on a large machine to process all files consecutively. +It is a long for-loop over desired exchanges. + +```shell +python scripts/data/pmp/stack_exchange_process.py --save_path=/path/to/hf-dataset +``` + +3) `binarize.py`: used to binarize the pre-filter Stack Exchange data (and in the future, Reddit / Wikipedia) +```shell +python scripts/data/pmp/binarize.py --save_path=/path/to/hf-dataset +``` \ No newline at end of file diff --git a/data_analysis/stackoverflow/h4_code/binarize.py b/data_analysis/stackoverflow/h4_code/binarize.py new file mode 100644 index 0000000..79bcce8 --- /dev/null +++ b/data_analysis/stackoverflow/h4_code/binarize.py @@ -0,0 +1,117 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import random
from argparse import ArgumentParser
from pathlib import Path

import numpy as np


# Repo root; fall back to the working directory when the script lives at a
# shallower path (parents[3] would raise IndexError and break plain import).
try:
    H4_DIR = Path(__file__).resolve().parents[3]
except IndexError:
    H4_DIR = Path(".").resolve()
DATA_DIR = H4_DIR / "data"


def binarize_records(records, binary_score, data_len_limit=np.inf):
    """Convert scored Q/A records into binarized preference-pretraining rows.

    Args:
        records: iterable of dicts, each with a ``question`` string and an
            ``answers`` list of dicts carrying ``pm_score`` and ``text``.
        binary_score: placeholder score written on every emitted row; the
            preference signal lives in the pairing itself, not this number.
        data_len_limit: stop after consuming this many input records
            (used by ``--debug``).

    Returns:
        list of ``{"context": ..., "score": ...}`` rows, two per kept question.

    Questions whose answers do not span at least two distinct ``pm_score``
    values are skipped -- there is no preference signal to extract from them.
    """
    pmp_data = []
    for i, record in enumerate(records):
        # check debug limit, quit early if in debug mode
        if i > data_len_limit:
            print("Early exit for debug mode!")
            print(pmp_data)
            break

        question = record["question"]
        answers = record["answers"]

        answer_scores = [a["pm_score"] for a in answers]
        if len(np.unique(answer_scores)) < 2:
            print(f"PM Scores are {answer_scores}, skipping this question {i}")
            continue

        # Sample two answers with different scores. The loop terminates
        # because we verified above that >= 2 unique scores exist.
        while True:
            two_answers = random.sample(answers, 2)
            if two_answers[0]["pm_score"] != two_answers[1]["pm_score"]:
                break

        for answer in two_answers:
            context = "Question: " + question + "\n" + "Answer: " + answer["text"]
            pmp_data.append({"context": context, "score": binary_score})

    return pmp_data


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--debug", action="store_true", help="Added print statements / limit data size for debugging")
    parser.add_argument(
        "--output_dir",
        default=f"{DATA_DIR}/pmp-binarized",
        type=str,
        help="Where to save the processed dataset",
    )
    parser.add_argument(
        "--exchange_name",
        type=str,
        default=None,
        help="Optional argument to specify a specific subsection of the dataset",
    )
    parser.add_argument(
        "--binary_score", type=int, default=8, help="Score assigned to binarized pairs for preference data."
    )
    parser.add_argument(
        "--stream_data", action="store_true", help="Optionally stream data, which can be useful with weaker computers"
    )
    parser.set_defaults(debug=False, stream_data=False)  # default will process full dataset
    args = parser.parse_args()

    # Heavy third-party / project imports are deferred into the entry point so
    # binarize_records stays importable without the full H4 environment.
    from datasets import Dataset, concatenate_datasets, load_dataset

    from h4.data.utils import save_dataset_shards

    data_dir = "data/" + args.exchange_name if args.exchange_name else None
    data_len_limit = 10000 if args.debug else np.inf

    dataset = load_dataset(
        "HuggingFaceH4/pmp-stack-exchange",
        data_dir=data_dir,
        split="train",
        streaming=args.stream_data,
    )

    pmp_data = binarize_records(iter(dataset), args.binary_score, data_len_limit)

    # Save binarized data in bounded chunks to bypass a known Arrow issue:
    # https://issues.apache.org/jira/browse/ARROW-17137
    sublist_len = 100000
    print(f"Dataset length is {len(pmp_data)}")
    print(f"Processed dataset length > {sublist_len}, processing to HF dataset in chunks")
    chunks = [pmp_data[x : x + sublist_len] for x in range(0, len(pmp_data), sublist_len)]
    ds_chunks = [Dataset.from_list(ch) for ch in chunks]
    ds = concatenate_datasets(ds_chunks)

    save_dataset_shards(ds, args.output_dir, subset="stackexchange", shard_size="100MB")
# See the License for the specific language governing permissions and
# limitations under the License.
"""Filter one Stack Exchange dump into the question/answer preference format
of Askell et al. 2021 (preference model pretraining).

Downloads and unpacks <exchange>.stackexchange.com.7z if needed, then joins
Posts.xml with Users.xml (author attribution is required by the Stack
Exchange data-dump license) and prints or saves scored answer groups.
"""
import datetime
import json
import os
import time
from argparse import ArgumentParser

import numpy as np


try:
    # lxml is much faster when available; fall back to the stdlib parser.
    from lxml import etree as ET
except ImportError:
    import xml.etree.ElementTree as ET


def str2bool(value):
    """Parse a CLI boolean.

    argparse's ``type=bool`` treats ANY non-empty string (including "False")
    as True; this parser makes ``--debug=False`` actually mean False.
    """
    return str(value).lower() in {"1", "true", "yes", "y"}


def print_dict(d):
    """Debug helper: print one "key, value" line per entry."""
    for key, val in d.items():
        print(f"{key}, {val}")


def simplify_date(date_string):
    """Collapse a Stack Exchange ISO timestamp to "YYYY/MM/DD"."""
    date = datetime.datetime.strptime(date_string.split(".")[0], "%Y-%m-%dT%H:%M:%S")
    return date.strftime("%Y/%m/%d")


def pm_score(score, accepted):
    """Map a raw answer vote count to the PMP preference score.

    Non-negative scores are log-compressed, round(log2(1 + score)), with a
    +1 bonus for the accepted answer.  Negative scores collapse to -1; it is
    not documented whether a negative answer can be accepted, assumed no.
    """
    if score < 0:
        return -1
    s = round(np.log2(1 + score))
    return s + 1 if accepted else s


def ensure_raw_data(data_dir, se_name):
    """Download and unpack ``<se_name>.7z`` into ``data_dir/<se_name>/`` if
    Posts.xml is not already present."""
    ex_data_file = data_dir + se_name + "/Posts.xml"
    if os.path.exists(ex_data_file):
        return
    ex_data_file_7z = se_name + ".7z"
    if not os.path.exists(data_dir + ex_data_file_7z):
        print("Loading raw data, this can take a second!")
        import py7zr
        import requests

        ex_data_url = (
            "https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml/resolve/main/"
            + ex_data_file_7z
        )
        response = requests.get(ex_data_url, allow_redirects=True)
        filename = os.path.basename(ex_data_url)

        if response.status_code == 200:
            with open(data_dir + filename, "wb") as out:
                out.write(response.content)
            os.mkdir(data_dir + se_name)
            with py7zr.SevenZipFile(data_dir + filename, "r") as archive:
                archive.extractall(data_dir + se_name + "/")
        else:
            print("Request failed: %d" % response.status_code)

    print("Loaded data, now processing!")


def main():
    parser = ArgumentParser()
    parser.add_argument("--stack_exchange", default="ai", type=str, help="Which stack exchange data to process")
    parser.add_argument(
        "--save_to_text", default=False, type=str2bool, help="Whether or not the outputs are saved to a text file."
    )
    parser.add_argument("--debug", default=False, type=str2bool, help="Added print statements for debugging")
    args = parser.parse_args()

    save = args.save_to_text
    se_name = args.stack_exchange + ".stackexchange.com"
    debug = args.debug

    start_time = time.time()

    data_dir = "data/"
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)

    ensure_raw_data(data_dir, se_name)

    # XML file structure:
    # * PostTypeId ranges over 1: Question, 2: Answer, ...
    # * we keep questions with >= 2 answers; AcceptedAnswerId marks the winner
    # (docs https://meta.stackexchange.com/questions/2677/database-schema-documentation-for-the-public-data-dump-and-sede)
    local_path = data_dir + se_name + "/"

    # user id -> display name; needed for license-mandated attribution
    user_info = {-1: "(user-deleted)"}
    question_info = {}
    answer_info = {}

    with open(local_path + "Users.xml", "rb") as f:  # Users file
        tree = ET.parse(f)
        for row in tree.iter("row"):
            user_info[int(row.attrib["Id"])] = str(row.attrib["DisplayName"])

    if debug:
        print_dict(user_info)

    with open(local_path + "Posts.xml", "rb") as f:  # Posts file
        tree = ET.parse(f)

        # First pass: questions with at least two answers.
        for row in tree.iter("row"):
            if "AnswerCount" not in row.attrib:
                continue
            ans_count = int(row.attrib["AnswerCount"])
            if ans_count < 2:
                continue

            tag = int(row.attrib["Id"])
            # deleted users redirect to the community page
            user_id = int(row.attrib.get("OwnerUserId", -1))
            question_info[tag] = {
                "Body": row.attrib["Body"],
                "AnswerCount": ans_count,
                "PostScore": int(row.attrib["Score"]),
                "Author": user_id,
                "metadata": [
                    "https://" + se_name + "/questions/" + str(tag),
                    "https://" + se_name,
                    # no username after the trailing slash: names may contain spaces
                    "https://" + se_name + "/users/" + str(user_id) + "/",
                ],
                "Date": simplify_date(row.attrib["CreationDate"]),
                "AcceptedAnswerId": (
                    int(row.attrib["AcceptedAnswerId"]) if "AcceptedAnswerId" in row.attrib else None
                ),
            }
            if debug:
                print_dict(question_info[tag])

        # Second pass: answers (PostTypeId == 2) attached to kept questions.
        for row in tree.iter("row"):
            if int(row.attrib["PostTypeId"]) != 2:
                continue
            parent = int(row.attrib["ParentId"])
            if parent not in question_info:
                continue

            user_id = int(row.attrib.get("OwnerUserId", -1))
            info = answer_info.setdefault(
                parent, {"Text": [], "Score": [], "Id": [], "Author": [], "AuthorNames": []}
            )
            info["Text"].append(row.attrib["Body"])
            info["Score"].append(int(row.attrib["Score"]))
            # this id earns a score bonus later if it matches AcceptedAnswerId
            info["Id"].append(int(row.attrib["Id"]))
            info["Author"].append(user_id)
            # rare: an answer author can be missing from Users.xml
            info["AuthorNames"].append(user_info.get(user_id, "(user-not-found)"))
            if debug:
                print_dict(info)

    # don't save in debug mode
    if debug:
        return

    output_file = open(data_dir + "output.jsonl", "w") if save else None

    print(" ------ printing processed questions ------ ------ ------ ------ ------ ------ ")
    for k, question_data in question_info.items():
        if not save:
            print(" . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ")
            print(f"Question (id: {k}): {question_data['Body']}")

        if k not in answer_info:
            # AnswerCount can be stale (e.g. answers deleted after counting);
            # skip instead of raising KeyError
            continue

        accepted_ans = question_data["AcceptedAnswerId"]
        answer_data = answer_info[k]

        # filter for number of unique scores to be >= 2 (per the paper)
        scores = answer_data["Score"]
        if len(np.unique(scores)) < 2:
            continue

        answers = []
        for text, score, ans_id, auth_name, auth_id in zip(
            answer_data["Text"], scores, answer_data["Id"], answer_data["AuthorNames"], answer_data["Author"]
        ):
            accepted = accepted_ans == ans_id
            s = pm_score(score, accepted)
            answers.append(
                {
                    "AnswerID": ans_id,
                    "text": text,
                    "pm_score": s,
                    "selected": accepted,
                    "Author": auth_name,
                    "AuthorID": auth_id,
                    "AuthorProfile": "https://" + se_name + "/users/" + str(auth_id),
                }
            )
            if not save:
                # *** marks the accepted (preferred) answer
                pref = ", ***" if accepted else ""
                print(f"Answer (id {ans_id}, s:{s}{pref}): {text}")
                print(" . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ")

        if save:
            json.dump(
                {
                    "qid": k,
                    "question": question_data["Body"],
                    "answers": answers,
                    "date": question_data["Date"],
                    "metadata": question_data["metadata"],
                },
                output_file,
            )
            # one object per line -- without this the "jsonl" file was a
            # single run of concatenated JSON objects
            output_file.write("\n")

    if output_file is not None:
        output_file.close()
    print(f"finished at {time.time() - start_time}s")


# Developer notes kept from the original script (scaling / follow-ups):
_SCALING_NOTES = """
Added options/notes for scaling & changing this script

Adding a dataloader to use HuggingFace Datasets
`from datasets import load_dataset`
-----
Logs on loading 7z files -- example for the samsum dataset:
https://github.com/huggingface/datasets/blob/fedf891a08bfc77041d575fad6c26091bc0fce52/datasets/samsum/samsum.py#L106-L110
-----
Making a cleaner repo + dataloader out of the raw data here:
https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml/tree/main
* move many files into folder (how to do that without loading)?
* add data loader (see above, shouldn't be so hard)
* figure out storage datatype of the processed data
----
Maybe consider using Beautiful Soup?
https://www.crummy.com/software/BeautifulSoup/bs4/doc/

# list files in the raw repository
from huggingface_hub import HfApi
api = HfApi()
se_files = api.list_repo_files("flax-sentence-embeddings/stackexchange_xml", repo_type="dataset")
se_data_files = [f for f in se_files if "7z" in f]
se_names = [f[:f.find(".")] for f in se_files if "7z" in f]
se_names = [f + ".meta" if (i%2) == 0 else f for i, f in enumerate(se_names)]
"""


if __name__ == "__main__":
    main()
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Batch version of stack_exchange_explore.py: download, unpack and process
every Stack Exchange dump in ALL_EXCHANGES into the PMP question/answer
format, saving each exchange as sharded HF datasets.

Designed to run on a large machine; exchanges are processed consecutively.
"""
import datetime
import os
import time
from argparse import ArgumentParser
from pathlib import Path

import numpy as np


try:
    # lxml is much faster when available; fall back to the stdlib parser.
    from lxml import etree as ET
except ImportError:
    import xml.etree.ElementTree as ET


# Repo root; fall back to the working directory when the script lives at a
# shallower path (parents[3] would raise IndexError and break plain import).
try:
    H4_DIR = Path(__file__).resolve().parents[3]
except IndexError:
    H4_DIR = Path(".").resolve()
# TODO: Ideally we would use PosixPath here, but it doesn't work with the way the script is implemented :)
DATA_DIR = str(H4_DIR) + "/data/pmp-stack-exchange/"

# stack exchanges we filter (order matters: --start_idx indexes into it)
ALL_EXCHANGES = [
    "3dprinting.meta", "3dprinting", "academia.meta", "academia", "ai.meta", "ai",
    "android.meta", "android", "anime.meta", "anime", "apple.meta", "apple",
    "arduino.meta", "arduino", "askubuntu", "astronomy", "astronomy.meta",
    "aviation", "aviation.meta", "avp", "avp.meta", "beer", "beer.meta",
    "bicycles", "bicycles.meta", "bioinformatics", "bioinformatics.meta",
    "biology", "biology.meta", "bitcoin", "bitcoin.meta", "blender", "blender.meta",
    "boardgames", "boardgames.meta", "bricks", "bricks.meta", "buddhism", "buddhism.meta",
    "cardano", "cardano.meta", "chemistry", "chemistry.meta", "chess", "chess.meta",
    "chinese", "chinese.meta", "christianity", "christianity.meta", "civicrm", "civicrm.meta",
    "codegolf", "codegolf.meta", "codereview", "codereview.meta", "coffee", "coffee.meta",
    "cogsci", "cogsci.meta", "computergraphics", "computergraphics.meta",
    "conlang", "conlang.meta", "cooking", "cooking.meta", "craftcms", "craftcms.meta",
    "crafts", "crafts.meta", "crypto", "crypto.meta", "cs", "cs.meta",
    "cseducators", "cseducators.meta", "cstheory", "cstheory.meta",
    "datascience", "datascience.meta", "dba", "dba.meta", "devops", "devops.meta",
    "diy", "diy.meta", "drones", "drones.meta", "drupal", "drupal.meta",
    "dsp", "dsp.meta", "earthscience", "earthscience.meta", "ebooks", "ebooks.meta",
    "economics", "economics.meta", "electronics", "electronics.meta",
    "elementaryos", "elementaryos.meta", "ell", "ell.meta", "emacs", "emacs.meta",
    "engineering", "engineering.meta", "english", "english.meta", "eosio", "eosio.meta",
    "esperanto", "esperanto.meta", "ethereum", "ethereum.meta",
    "expatriates", "expatriates.meta", "expressionengine", "expressionengine.meta",
    "fitness", "fitness.meta", "freelancing", "freelancing.meta", "french", "french.meta",
    "gamedev", "gamedev.meta", "gaming", "gaming.meta", "gardening", "gardening.meta",
    "genealogy", "genealogy.meta", "german", "german.meta", "gis", "gis.meta",
    "graphicdesign", "graphicdesign.meta", "ham", "ham.meta",
    "hardwarerecs", "hardwarerecs.meta", "health", "health.meta",
    "hermeneutics", "hermeneutics.meta", "hinduism", "hinduism.meta",
    "history", "history.meta", "homebrew", "homebrew.meta", "hsm", "hsm.meta",
    "interpersonal", "interpersonal.meta", "iot", "iot.meta", "iota", "iota.meta",
    "islam", "islam.meta", "italian", "italian.meta", "japanese", "japanese.meta",
    "joomla", "joomla.meta", "judaism", "judaism.meta", "korean", "korean.meta",
    "languagelearning", "languagelearning.meta", "latin", "latin.meta",
    "law", "law.meta", "lifehacks", "lifehacks.meta", "linguistics", "linguistics.meta",
    "literature", "literature.meta", "magento", "magento.meta",
    "martialarts", "martialarts.meta", "materials", "materials.meta",
    "math", "math.meta", "matheducators", "matheducators.meta",
    "mathematica", "mathematica.meta", "mathoverflow", "mechanics.meta", "mechanics",
    "meta.askubuntu", "meta.mathoverflow", "meta.serverfault", "meta.stackexchange",
    "meta.stackoverflow", "meta.superuser", "moderators.meta", "moderators",
    "monero.meta", "monero", "money.meta", "money", "movies.meta", "movies",
    "music.meta", "music", "musicfans.meta", "musicfans", "mythology.meta", "mythology",
    "networkengineering.meta", "networkengineering", "opendata.meta", "opendata",
    "opensource.meta", "opensource", "or.meta", "or", "outdoors.meta", "outdoors",
    "parenting.meta", "parenting", "patents.meta", "patents", "pets.meta", "pets",
    "philosophy.meta", "philosophy", "photo.meta", "photo", "physics.meta", "physics",
    "pm.meta", "pm", "poker.meta", "poker", "politics.meta", "politics",
    "portuguese.meta", "portuguese", "puzzling.meta", "puzzling", "quant.meta", "quant",
    "quantumcomputing.meta", "quantumcomputing", "raspberrypi.meta", "raspberrypi",
    "retrocomputing.meta", "retrocomputing", "reverseengineering.meta", "reverseengineering",
    "robotics.meta", "robotics", "rpg.meta", "rpg", "rus.meta", "rus",
    "russian.meta", "russian", "salesforce.meta", "salesforce", "scicomp.meta", "scicomp",
    "scifi.meta", "scifi", "security.meta", "security", "serverfault",
    "sharepoint", "sharepoint.meta", "sitecore", "sitecore.meta",
    "skeptics", "skeptics.meta", "softwareengineering", "softwareengineering.meta",
    "softwarerecs", "softwarerecs.meta", "sound", "sound.meta", "space", "space.meta",
    "spanish", "spanish.meta", "sports", "sports.meta", "sqa", "sqa.meta",
    "stackapps", "stats.meta", "stats", "stellar.meta", "stellar", "superuser",
    "sustainability", "sustainability.meta", "tex", "tex.meta", "tezos", "tezos.meta",
    "tor", "tor.meta", "travel", "travel.meta", "tridion", "tridion.meta",
    "ukrainian", "ukrainian.meta", "unix", "unix.meta", "ux", "ux.meta",
    "vegetarianism", "vegetarianism.meta", "vi", "vi.meta", "webapps", "webapps.meta",
    "webmasters", "webmasters.meta", "windowsphone", "windowsphone.meta",
    "woodworking", "woodworking.meta", "wordpress", "wordpress.meta",
    "workplace", "workplace.meta", "worldbuilding", "worldbuilding.meta",
    "writers", "writers.meta",
    "Stackoverflow",  # hardcoded for different URL structure
]

# Some excluded stack exchanges below (not a maintained list)
# spanish: es.meta.stackoverflow.com.7z, es.stackoverflow.com.7z
# japanese: ja.meta.stackoverflow.com.7z, ja.stackoverflow.com.7z
# some language: pt.stackoverflow.com, pt.meta.stackoverflow.com
# ru.stackoverflow, ru.meta.stackoverflow

# stack exchanges with different processing, these end in .net ;(
DOTNET_LIST = ["mathoverflow", "meta.mathoverflow"]

# stack exchanges without .stackexchange.com (includes above)
SHORT_URL_LIST = [
    "askubuntu",
    "meta.askubuntu",
    "meta.serverfault",
    "meta.stackexchange",
    "meta.stackoverflow",
    "stackexchange",
    "superuser",
    "meta.superuser",
    "serverfault",
    "stackapps",
    "Stackoverflow",
]
SHORT_URL_LIST += DOTNET_LIST


def get_and_unpack_7z(directory: str, data_save_dir: str, save_dir_override: str = None):
    """Download ``directory`` (a path/name ending in .7z) from archive.org's
    stackexchange collection into ``data_save_dir`` and extract it.

    Args:
        directory: archive name (or path whose basename is taken), must end in ".7z".
        data_save_dir: local directory the archive is written and extracted into.
            (The original mixed this parameter with the DATA_DIR global --
            checked one path, wrote another; now used consistently.)
        save_dir_override: extract into this subdirectory instead of the archive's
            own name (used to merge the Stackoverflow Posts/Users archives).
    """
    # network/unpack deps only needed when actually downloading
    import py7zr
    import requests

    se_name_7z = directory[directory.rfind("/") + 1 :]
    se_name = se_name_7z[:-3]
    assert ".7z" == se_name_7z[-3:]

    # check if the archive already exists (no need to re-download):
    if os.path.exists(data_save_dir + se_name_7z):
        print("Raw 7z data already exists for this dir :)")
        return

    print("Loading raw data, this can take a second!")
    ex_data_url = "https://archive.org/download/stackexchange/" + se_name_7z
    response = requests.get(ex_data_url, allow_redirects=True)
    filename = os.path.basename(ex_data_url)

    print("Unpacking raw data.")
    if response.status_code == 200:
        with open(data_save_dir + filename, "wb") as out:
            out.write(response.content)
        os.mkdir(data_save_dir + se_name)
        with py7zr.SevenZipFile(data_save_dir + filename, "r") as archive:
            save_dir = save_dir_override if save_dir_override else se_name
            archive.extractall(data_save_dir + save_dir + "/")
    else:
        print("Request failed: %d" % response.status_code)

    print("Loaded & unpacked data, now processing...")


def print_dict(d):
    """Debug helper: print one "key, value" line per entry."""
    for key, val in d.items():
        print(f"{key}, {val}")


def simplify_date(date_string):
    """Collapse a Stack Exchange ISO timestamp to "YYYY/MM/DD"."""
    date = datetime.datetime.strptime(date_string.split(".")[0], "%Y-%m-%dT%H:%M:%S")
    return date.strftime("%Y/%m/%d")


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument(
        "--all",
        action="store_true",
        help="If the script will process all stack exchanges: warning, requires large amount of RAM",
    )
    parser.add_argument("--save_path", default=DATA_DIR, type=str, help="Path to the huggingface dataset preferably.")
    parser.add_argument(
        "--start_idx",
        default=0,
        type=int,
        help="Optional value to skip a number of exchanges in the above list if processing crashed midway",
    )
    parser.add_argument("--shard_size", default=100, type=int, help="Maximum size of file for subsets of data in MB")
    parser.add_argument("--debug", action="store_true", help="Added print statements for debugging")
    parser.set_defaults(debug=False, all=False)

    args = parser.parse_args()

    # Heavy third-party / project imports deferred so the module stays
    # importable without the full H4 environment.
    from datasets import Dataset, concatenate_datasets

    from h4.data.utils import save_dataset_shards

    shard_size = str(args.shard_size) + "MB"
    process_all = args.all
    save_path = args.save_path
    start_idx = args.start_idx
    DEBUG = args.debug
    if process_all:
        se_list = ALL_EXCHANGES
    else:
        # --all is a store_true flag, so the correct invocation is plain "--all"
        print("Run from command line with --all to process all data")
        se_list = ["ai", "apple", "pets", "ai.meta"]

    os.makedirs(DATA_DIR, exist_ok=True)

    # Process all exchanges in a loop (accumulates each section in memory)
    TOTAL = len(se_list) - 1
    for i, se_sub_name in enumerate(se_list[start_idx:]):
        print(f"SECTION {i + start_idx}/{TOTAL}: {se_sub_name} - START")

        # some stack exchanges don't use .stackexchange.com
        if se_sub_name not in SHORT_URL_LIST:
            se_full_name = se_sub_name + ".stackexchange.com"
        elif se_sub_name in DOTNET_LIST:  # two exchanges need .net
            se_full_name = se_sub_name + ".net"
        else:
            se_full_name = se_sub_name + ".com"

        start_time = time.time()
        full_section_data = []

        # Dumps live at e.g.
        #   https://archive.org/download/stackexchange/Stackoverflow.com-Posts.7z
        #   https://archive.org/download/stackexchange/Stackoverflow.com-Users.7z
        # Archives extract under the LOWERCASED exchange name, so the
        # existence check must use the lowercased path too (otherwise the
        # "Stackoverflow.com" case re-downloads on every run).
        ex_data_file = DATA_DIR + se_full_name.lower() + "/Users.xml"
        if not os.path.exists(ex_data_file):
            ex_data_file_7z = se_full_name + ".7z"
            if "Stackoverflow.com" in ex_data_file_7z:
                # Stack Overflow ships per-table archives; merge Posts + Users
                base_stackoverflow_dir = ex_data_file_7z[:-3]
                get_and_unpack_7z(
                    base_stackoverflow_dir + "-Posts.7z", DATA_DIR, save_dir_override="stackoverflow.com"
                )
                get_and_unpack_7z(
                    base_stackoverflow_dir.lower() + "-Users.7z", DATA_DIR, save_dir_override="stackoverflow.com"
                )  # users archive name only is lowercase s
            else:
                get_and_unpack_7z(ex_data_file_7z, DATA_DIR)

        # load extracted xml files (lowercased dir, see above)
        local_path = DATA_DIR + se_full_name.lower() + "/"
        posts_subpath = "Posts.xml"
        users_subpath = "Users.xml"

        # XML file structure:
        # * PostTypeId ranges over 1: Question, 2: Answer, ...
        # * we keep questions with >= 2 answers; AcceptedAnswerId marks the winner
        # (docs https://meta.stackexchange.com/questions/2677/database-schema-documentation-for-the-public-data-dump-and-sede)

        # user id -> display name; needed for license-mandated attribution
        user_info = {-1: "(user-deleted)"}
        question_info = {}
        answer_info = {}

        with open(local_path + users_subpath, "rb") as f:  # Users file
            tree = ET.parse(f)
            for row in tree.iter("row"):
                user_info[int(row.attrib["Id"])] = str(row.attrib["DisplayName"])

        if DEBUG:
            print_dict(user_info)

        with open(local_path + posts_subpath, "rb") as f:  # Posts file
            tree = ET.parse(f)

            # First pass: questions with at least two answers.
            for row in tree.iter("row"):
                if "AnswerCount" not in row.attrib:
                    continue
                ans_count = int(row.attrib["AnswerCount"])
                if ans_count < 2:
                    continue

                tag = int(row.attrib["Id"])
                # deleted users redirect to the community page
                user_id = int(row.attrib.get("OwnerUserId", -1))
                question_info[tag] = {
                    "Body": row.attrib["Body"],
                    "AnswerCount": ans_count,
                    "PostScore": int(row.attrib["Score"]),
                    "Author": user_id,
                    "metadata": [
                        "https://" + se_full_name + "/questions/" + str(tag),  # question URL
                        "https://" + se_full_name,  # Exchange URL
                        # Author URL -- no username after the slash: names may contain spaces
                        "https://" + se_full_name + "/users/" + str(user_id) + "/",
                    ],
                    "Date": simplify_date(row.attrib["CreationDate"]),
                    "AcceptedAnswerId": (
                        int(row.attrib["AcceptedAnswerId"]) if "AcceptedAnswerId" in row.attrib else None
                    ),
                }
                if DEBUG:
                    print_dict(question_info[tag])

            # Second pass: answers (PostTypeId == 2) attached to kept questions.
            for row in tree.iter("row"):
                if int(row.attrib["PostTypeId"]) != 2:
                    continue
                parent = int(row.attrib["ParentId"])
                if parent not in question_info:
                    continue

                user_id = int(row.attrib.get("OwnerUserId", -1))
                info = answer_info.setdefault(
                    parent, {"Text": [], "Score": [], "Id": [], "Author": [], "AuthorNames": []}
                )
                info["Text"].append(row.attrib["Body"])
                info["Score"].append(int(row.attrib["Score"]))
                # this id earns a score bonus later if it matches AcceptedAnswerId
                info["Id"].append(int(row.attrib["Id"]))
                info["Author"].append(user_id)
                # rare case: the username for an answer author is not in the database
                info["AuthorNames"].append(user_info.get(user_id, "(user-not-found)"))
                if DEBUG:
                    print_dict(info)

        for k, question_data in question_info.items():
            accepted_ans = question_data["AcceptedAnswerId"]

            if k not in answer_info:
                # AnswerCount can be stale (e.g. answers deleted after
                # counting); skip instead of raising KeyError
                continue
            answer_data = answer_info[k]

            # filter for number of unique scores to be >= 2 (per the paper)
            scores = answer_data["Score"]
            if len(np.unique(scores)) < 2:
                continue

            answers = []
            for text, score, ans_id, auth_name, auth_id in zip(
                answer_data["Text"], scores, answer_data["Id"], answer_data["AuthorNames"], answer_data["Author"]
            ):
                accepted = accepted_ans == ans_id
                if score >= 0:
                    # log-compress non-negative vote counts; +1 bonus if accepted
                    # (not documented if negative answers can be accepted, assuming no)
                    s = round(np.log2(1 + score))
                    if accepted:
                        s += 1
                else:
                    s = -1

                answers.append(
                    {
                        "answer_id": ans_id,
                        "text": text,
                        "pm_score": s,
                        "selected": accepted,
                        "author": auth_name,
                        "author_id": auth_id,
                        "author_profile": "https://" + se_full_name + "/users/" + str(auth_id),
                    }
                )

            full_section_data.append(
                {
                    "qid": k,
                    "question": question_data["Body"],
                    "answers": answers,
                    "date": question_data["Date"],
                    "metadata": question_data["metadata"],
                }
            )

        print(f"finished section {se_full_name} at {time.time() - start_time}s")

        if not DEBUG:
            sublist_len = 100000

            # bypass known issue in arrow https://issues.apache.org/jira/browse/ARROW-17137
            if len(full_section_data) > sublist_len:
                print(f"Processed dataset length > {sublist_len}, processing to HF dataset in chunks")
                chunks = [
                    full_section_data[x : x + sublist_len] for x in range(0, len(full_section_data), sublist_len)
                ]
                ds = concatenate_datasets([Dataset.from_list(ch) for ch in chunks])
            else:
                ds = Dataset.from_list(full_section_data)

            save_dataset_shards(ds, save_path, subset=se_full_name, shard_size=shard_size)
https://github.com/huggingface/h4/blob/main/scripts/data/pmp/stack_exchange_process.py +import datetime +import os +import time +import xml.etree.ElementTree as ET +from collections import defaultdict + +from datasets import Dataset, concatenate_datasets +from tqdm import tqdm + +# Note: Using rclone + py7zr in command line is often faster than this +import py7zr +import requests + +# If the cleaning becomes a bottleneck at some point, could be better to use +# this snippet from Anton https://gist.github.com/anton-l/4bfafb42878a8e77b20f3b844d9cae36 +# (uses selectolax, faster than bs4) instead. +from bs4 import BeautifulSoup +from se_reference_utils import ALL_EXCHANGES + + +DATA_DIR = "data/stack-exchange" +WTOKEN = os.getenv("WTOKEN") + + +def simplify_date(date_string): + date = datetime.datetime.strptime(date_string.split(".")[0], "%Y-%m-%dT%H:%M:%S") + return date.strftime("%Y/%m/%d") + + +def download_and_extract_se7z(name: str, directory: str, data_save_dir: str, save_dir_override: str = None): + # Downloading 7z file + if os.path.exists(f"{data_save_dir}/{name}.7z"): + print("Raw 7z data already exists for this dir.") + else: + print("Downloading compressed data.") + + ex_data_url = f"https://archive.org/download/stackexchange/{directory}" + response = requests.get(ex_data_url, allow_redirects=True) + + if response.status_code != 200: + raise ConnectionError(f"Request failed: {response.status_code} for subset: {name}, url: {ex_data_url}") + + print("Unpacking raw data.") + with open(f"{DATA_DIR}/{name}.7z", "wb") as out: + out.write(response.content) + + os.mkdir(f"{DATA_DIR}/{name}") + with py7zr.SevenZipFile(f"{DATA_DIR}/{name}.7z", "r") as archive: + save_dir = save_dir_override if save_dir_override is not None else name + archive.extractall(f"{DATA_DIR}/{save_dir}/") + + print(f"{name} successfully extracted.") + + +def get_question_from_html(exchange): + question = {} + keys_of_interest = ["Id", "Body", "AnswerCount", "OwnerUserId", "PostScore", 
"Date", "AcceptedAnswerId"] + for key in keys_of_interest: + try: + if key in ["Id", "AnswerCount", "PostScore", "AcceptedAnswerId", "OwnerUserId"]: + question[key] = int(exchange.attrib[key]) + elif key == "Date": + question[key] = simplify_date(exchange.attrib["CreationDate"]) + elif key == "Body": + question[key] = exchange.attrib[key] + question["text"] = BeautifulSoup(exchange.attrib[key], "lxml").text + else: + question[key] = exchange.attrib[key] + except KeyError: + # deleted user redirect to community page > -1 + question[key] = -1 if key == "OwnerUserId" else None + + question["metadata"] = [ + f"https://{se_sub_url}/questions/{str(question['Id'])}", # question URL + f"https://{se_sub_url}", # Exchange URL + f"https://{se_sub_url}/users/{str(question['OwnerUserId'])}/", # Author URL + ] + + return question["Id"], question + + +def get_answer_from_html(exchange): + # We connect answers to their parent's id + parent_id = int(exchange.attrib["ParentId"]) + + answer = {} + keys_of_interest = ["Body", "Score", "Id", "OwnerUserId"] + for key in keys_of_interest: + try: + if key in ["Score", "Id", "OwnerUserId"]: + answer[key] = int(exchange.attrib[key]) + elif key == "Body": + answer[key] = exchange.attrib[key] + answer["text"] = BeautifulSoup(exchange.attrib[key], "lxml").text + else: + answer[key] = exchange.attrib[key] + except KeyError: + answer[key] = -1 if key == "OwnerUserId" else None + + return parent_id, answer + + +def get_posts_from_html(se_sub_name): + extracted_info = defaultdict(lambda: {"question": None, "answers": list()}) + with open(f"{DATA_DIR}/{se_sub_name}/Posts.xml", "rb") as f: + tree = ET.parse(f) + + for exchange in tree.iter("row"): + post_type = int(exchange.attrib["PostTypeId"]) + + if post_type == 1: # Question + if int(exchange.attrib["AnswerCount"]) > 0: + tag, question = get_question_from_html(exchange) + extracted_info[tag]["question"] = question + + elif post_type == 2: # Answer + tag, answer = get_answer_from_html(exchange) + 
extracted_info[tag]["answers"].append(answer) + return extracted_info + + +def get_jsonlines_from_posts(extracted_info): + result_jsonlines = [] + for tag, data in extracted_info.items(): + # Sorting answers by score (see LLAMA paper), and only keep positively scored ones + question = data["question"] + answers = [a for a in sorted(data["answers"], key=lambda x: x["Score"]) if a["Score"] > 0] + + # We skip empty questions or answers + if question is None or len(answers) < 1: + continue + + text = f"user{question['OwnerUserId']}: {question['text']}" + for answer in answers: + text += f"\nuser{answer['OwnerUserId']}: {answer['text']}" + + result = { + "question_id": question["Id"], + "text": text, + "metadata": question["metadata"], + "date": question["Date"], + "original_text": [f"{item['OwnerUserId']}: {item['Body']}" for item in [question] + answers], + } + result_jsonlines.append(result) + return result_jsonlines + + +def upload_to_hub(result_jsonlines): + size = len(result_jsonlines) + chunk_size = 100000 + if size > chunk_size: + chunks = [ + Dataset.from_list(result_jsonlines[i : min(i + chunk_size, size)]) for i in range(0, size, chunk_size) + ] + dataset = concatenate_datasets(chunks) + else: + dataset = Dataset.from_list(result_jsonlines) + + dataset.push_to_hub("HuggingFaceGECLM/StackExchange_Mar2023", split=se_sub_name, private=True, token=WTOKEN) + + +def main(se_sub_name, se_sub_url): + print(f"{se_sub_name} at {se_sub_url}.") + start_time = time.time() + + # Download and extract + if not os.path.exists(f"{DATA_DIR}/{se_sub_name}/Posts.xml"): + if "se_sub_name" == "stackoverflow": + # Note: we'll also need -Users.7z if we want to filter on licenses at some point + download_and_extract_se7z( + se_sub_name, f"{se_sub_url}-Posts.7z", DATA_DIR, save_dir_override="stackoverflow.com" + ) + else: + download_and_extract_se7z(se_sub_name, f"{se_sub_url}.7z", DATA_DIR) + + # Selects posts from HTML tree (Questions and answers) + extracted_info = 
get_posts_from_html(se_sub_name) + print("Posts parsed from HTML.") + + # Create json from posts + result_jsonlines = get_jsonlines_from_posts(extracted_info) + + print(f"Finished {se_sub_url} in {time.time() - start_time}s. Contains {len(result_jsonlines)} lines.") + + # Saves to the hub + upload_to_hub(result_jsonlines) + + +if __name__ == "__main__": + os.makedirs(DATA_DIR, exist_ok=True) + + # Process all exchanges in a loop - could be easily launched in parallel + for se_sub_name, se_sub_url in tqdm(ALL_EXCHANGES.items()): + main(se_sub_name, se_sub_url) diff --git a/data_analysis/stackoverflow/other/requirements.txt b/data_analysis/stackoverflow/other/requirements.txt new file mode 100644 index 0000000..80efabe --- /dev/null +++ b/data_analysis/stackoverflow/other/requirements.txt @@ -0,0 +1,5 @@ +datasets +py7zr +requests +tqdm +bs4 \ No newline at end of file diff --git a/data_analysis/stackoverflow/other/se_reference_utils.py b/data_analysis/stackoverflow/other/se_reference_utils.py new file mode 100644 index 0000000..a9cc434 --- /dev/null +++ b/data_analysis/stackoverflow/other/se_reference_utils.py @@ -0,0 +1,347 @@ +ALL_EXCHANGES = { + "3dprinting.meta": "3dprinting.meta.stackexchange.com", + "3dprinting": "3dprinting.stackexchange.com", + "academia.meta": "academia.meta.stackexchange.com", + "academia": "academia.stackexchange.com", + "ai.meta": "ai.meta.stackexchange.com", + "ai": "ai.stackexchange.com", + "android.meta": "android.meta.stackexchange.com", + "android": "android.stackexchange.com", + "anime.meta": "anime.meta.stackexchange.com", + "anime": "anime.stackexchange.com", + "apple.meta": "apple.meta.stackexchange.com", + "apple": "apple.stackexchange.com", + "arduino.meta": "arduino.meta.stackexchange.com", + "arduino": "arduino.stackexchange.com", + "askubuntu": "askubuntu.com", + "astronomy": "astronomy.stackexchange.com", + "astronomy.meta": "astronomy.meta.stackexchange.com", + "aviation": "aviation.stackexchange.com", + "aviation.meta": 
"aviation.meta.stackexchange.com", + "avp": "avp.stackexchange.com", + "avp.meta": "avp.meta.stackexchange.com", + "beer": "beer.stackexchange.com", + "beer.meta": "beer.meta.stackexchange.com", + "bicycles": "bicycles.stackexchange.com", + "bicycles.meta": "bicycles.meta.stackexchange.com", + "bioinformatics": "bioinformatics.stackexchange.com", + "bioinformatics.meta": "bioinformatics.meta.stackexchange.com", + "biology": "biology.stackexchange.com", + "biology.meta": "biology.meta.stackexchange.com", + "bitcoin": "bitcoin.stackexchange.com", + "bitcoin.meta": "bitcoin.meta.stackexchange.com", + "blender": "blender.stackexchange.com", + "blender.meta": "blender.meta.stackexchange.com", + "boardgames": "boardgames.stackexchange.com", + "boardgames.meta": "boardgames.meta.stackexchange.com", + "bricks": "bricks.stackexchange.com", + "bricks.meta": "bricks.meta.stackexchange.com", + "buddhism": "buddhism.stackexchange.com", + "buddhism.meta": "buddhism.meta.stackexchange.com", + "cardano": "cardano.stackexchange.com", + "cardano.meta": "cardano.meta.stackexchange.com", + "chemistry": "chemistry.stackexchange.com", + "chemistry.meta": "chemistry.meta.stackexchange.com", + "chess": "chess.stackexchange.com", + "chess.meta": "chess.meta.stackexchange.com", + "chinese": "chinese.stackexchange.com", + "chinese.meta": "chinese.meta.stackexchange.com", + "christianity": "christianity.stackexchange.com", + "christianity.meta": "christianity.meta.stackexchange.com", + "civicrm": "civicrm.stackexchange.com", + "civicrm.meta": "civicrm.meta.stackexchange.com", + "codegolf": "codegolf.stackexchange.com", + "codegolf.meta": "codegolf.meta.stackexchange.com", + "codereview": "codereview.stackexchange.com", + "codereview.meta": "codereview.meta.stackexchange.com", + "coffee": "coffee.stackexchange.com", + "coffee.meta": "coffee.meta.stackexchange.com", + "cogsci": "cogsci.stackexchange.com", + "cogsci.meta": "cogsci.meta.stackexchange.com", + "computergraphics": 
"computergraphics.stackexchange.com", + "computergraphics.meta": "computergraphics.meta.stackexchange.com", + "conlang": "conlang.stackexchange.com", + "conlang.meta": "conlang.meta.stackexchange.com", + "cooking": "cooking.stackexchange.com", + "cooking.meta": "cooking.meta.stackexchange.com", + "craftcms": "craftcms.stackexchange.com", + "craftcms.meta": "craftcms.meta.stackexchange.com", + "crafts": "crafts.stackexchange.com", + "crafts.meta": "crafts.meta.stackexchange.com", + "crypto": "crypto.stackexchange.com", + "crypto.meta": "crypto.meta.stackexchange.com", + "cs": "cs.stackexchange.com", + "cs.meta": "cs.meta.stackexchange.com", + "cseducators": "cseducators.stackexchange.com", + "cseducators.meta": "cseducators.meta.stackexchange.com", + "cstheory": "cstheory.stackexchange.com", + "cstheory.meta": "cstheory.meta.stackexchange.com", + "datascience": "datascience.stackexchange.com", + "datascience.meta": "datascience.meta.stackexchange.com", + "dba": "dba.stackexchange.com", + "dba.meta": "dba.meta.stackexchange.com", + "devops": "devops.stackexchange.com", + "devops.meta": "devops.meta.stackexchange.com", + "diy": "diy.stackexchange.com", + "diy.meta": "diy.meta.stackexchange.com", + "drones": "drones.stackexchange.com", + "drones.meta": "drones.meta.stackexchange.com", + "drupal": "drupal.stackexchange.com", + "drupal.meta": "drupal.meta.stackexchange.com", + "dsp": "dsp.stackexchange.com", + "dsp.meta": "dsp.meta.stackexchange.com", + "earthscience": "earthscience.stackexchange.com", + "earthscience.meta": "earthscience.meta.stackexchange.com", + "ebooks": "ebooks.stackexchange.com", + "ebooks.meta": "ebooks.meta.stackexchange.com", + "economics": "economics.stackexchange.com", + "economics.meta": "economics.meta.stackexchange.com", + "electronics": "electronics.stackexchange.com", + "electronics.meta": "electronics.meta.stackexchange.com", + "elementaryos": "elementaryos.stackexchange.com", + "elementaryos.meta": "elementaryos.meta.stackexchange.com", 
+ "ell": "ell.stackexchange.com", + "ell.meta": "ell.meta.stackexchange.com", + "emacs": "emacs.stackexchange.com", + "emacs.meta": "emacs.meta.stackexchange.com", + "engineering": "engineering.stackexchange.com", + "engineering.meta": "engineering.meta.stackexchange.com", + "english": "english.stackexchange.com", + "english.meta": "english.meta.stackexchange.com", + "eosio": "eosio.stackexchange.com", + "eosio.meta": "eosio.meta.stackexchange.com", + "esperanto": "esperanto.stackexchange.com", + "esperanto.meta": "esperanto.meta.stackexchange.com", + "ethereum": "ethereum.stackexchange.com", + "ethereum.meta": "ethereum.meta.stackexchange.com", + "expatriates": "expatriates.stackexchange.com", + "expatriates.meta": "expatriates.meta.stackexchange.com", + "expressionengine": "expressionengine.stackexchange.com", + "expressionengine.meta": "expressionengine.meta.stackexchange.com", + "fitness": "fitness.stackexchange.com", + "fitness.meta": "fitness.meta.stackexchange.com", + "freelancing": "freelancing.stackexchange.com", + "freelancing.meta": "freelancing.meta.stackexchange.com", + "french": "french.stackexchange.com", + "french.meta": "french.meta.stackexchange.com", + "gamedev": "gamedev.stackexchange.com", + "gamedev.meta": "gamedev.meta.stackexchange.com", + "gaming": "gaming.stackexchange.com", + "gaming.meta": "gaming.meta.stackexchange.com", + "gardening": "gardening.stackexchange.com", + "gardening.meta": "gardening.meta.stackexchange.com", + "genealogy": "genealogy.stackexchange.com", + "genealogy.meta": "genealogy.meta.stackexchange.com", + "german": "german.stackexchange.com", + "german.meta": "german.meta.stackexchange.com", + "gis": "gis.stackexchange.com", + "gis.meta": "gis.meta.stackexchange.com", + "graphicdesign": "graphicdesign.stackexchange.com", + "graphicdesign.meta": "graphicdesign.meta.stackexchange.com", + "ham": "ham.stackexchange.com", + "ham.meta": "ham.meta.stackexchange.com", + "hardwarerecs": "hardwarerecs.stackexchange.com", + 
"hardwarerecs.meta": "hardwarerecs.meta.stackexchange.com", + "health": "health.stackexchange.com", + "health.meta": "health.meta.stackexchange.com", + "hermeneutics": "hermeneutics.stackexchange.com", + "hermeneutics.meta": "hermeneutics.meta.stackexchange.com", + "hinduism": "hinduism.stackexchange.com", + "hinduism.meta": "hinduism.meta.stackexchange.com", + "history": "history.stackexchange.com", + "history.meta": "history.meta.stackexchange.com", + "homebrew": "homebrew.stackexchange.com", + "homebrew.meta": "homebrew.meta.stackexchange.com", + "hsm": "hsm.stackexchange.com", + "hsm.meta": "hsm.meta.stackexchange.com", + "interpersonal": "interpersonal.stackexchange.com", + "interpersonal.meta": "interpersonal.meta.stackexchange.com", + "iot": "iot.stackexchange.com", + "iot.meta": "iot.meta.stackexchange.com", + "iota": "iota.stackexchange.com", + "iota.meta": "iota.meta.stackexchange.com", + "islam": "islam.stackexchange.com", + "islam.meta": "islam.meta.stackexchange.com", + "italian": "italian.stackexchange.com", + "italian.meta": "italian.meta.stackexchange.com", + "japanese": "japanese.stackexchange.com", + "japanese.meta": "japanese.meta.stackexchange.com", + "joomla": "joomla.stackexchange.com", + "joomla.meta": "joomla.meta.stackexchange.com", + "judaism": "judaism.stackexchange.com", + "judaism.meta": "judaism.meta.stackexchange.com", + "korean": "korean.stackexchange.com", + "korean.meta": "korean.meta.stackexchange.com", + "languagelearning": "languagelearning.stackexchange.com", + "languagelearning.meta": "languagelearning.meta.stackexchange.com", + "latin": "latin.stackexchange.com", + "latin.meta": "latin.meta.stackexchange.com", + "law": "law.stackexchange.com", + "law.meta": "law.meta.stackexchange.com", + "lifehacks": "lifehacks.stackexchange.com", + "lifehacks.meta": "lifehacks.meta.stackexchange.com", + "linguistics": "linguistics.stackexchange.com", + "linguistics.meta": "linguistics.meta.stackexchange.com", + "literature": 
"literature.stackexchange.com", + "literature.meta": "literature.meta.stackexchange.com", + "magento": "magento.stackexchange.com", + "magento.meta": "magento.meta.stackexchange.com", + "martialarts": "martialarts.stackexchange.com", + "martialarts.meta": "martialarts.meta.stackexchange.com", + "materials": "materials.stackexchange.com", + "materials.meta": "materials.meta.stackexchange.com", + "math": "math.stackexchange.com", + "math.meta": "math.meta.stackexchange.com", + "matheducators": "matheducators.stackexchange.com", + "matheducators.meta": "matheducators.meta.stackexchange.com", + "mathematica": "mathematica.stackexchange.com", + "mathematica.meta": "mathematica.meta.stackexchange.com", + "mathoverflow": "mathoverflow.net", + "mechanics.meta": "mechanics.meta.stackexchange.com", + "mechanics": "mechanics.stackexchange.com", + "meta.askubuntu": "meta.askubuntu.com", + "meta.mathoverflow": "meta.mathoverflow.net", + "meta.serverfault": "meta.serverfault.com", + "meta.stackexchange": "meta.stackexchange.com", + "meta.stackoverflow": "meta.stackoverflow.com", + "meta.superuser": "meta.superuser.com", + "moderators.meta": "moderators.meta.stackexchange.com", + "moderators": "moderators.stackexchange.com", + "monero.meta": "monero.meta.stackexchange.com", + "monero": "monero.stackexchange.com", + "money.meta": "money.meta.stackexchange.com", + "money": "money.stackexchange.com", + "movies.meta": "movies.meta.stackexchange.com", + "movies": "movies.stackexchange.com", + "music.meta": "music.meta.stackexchange.com", + "music": "music.stackexchange.com", + "musicfans.meta": "musicfans.meta.stackexchange.com", + "musicfans": "musicfans.stackexchange.com", + "mythology.meta": "mythology.meta.stackexchange.com", + "mythology": "mythology.stackexchange.com", + "networkengineering.meta": "networkengineering.meta.stackexchange.com", + "networkengineering": "networkengineering.stackexchange.com", + "opendata.meta": "opendata.meta.stackexchange.com", + "opendata": 
"opendata.stackexchange.com", + "opensource.meta": "opensource.meta.stackexchange.com", + "opensource": "opensource.stackexchange.com", + "or.meta": "or.meta.stackexchange.com", + "or": "or.stackexchange.com", + "outdoors.meta": "outdoors.meta.stackexchange.com", + "outdoors": "outdoors.stackexchange.com", + "parenting.meta": "parenting.meta.stackexchange.com", + "parenting": "parenting.stackexchange.com", + "patents.meta": "patents.meta.stackexchange.com", + "patents": "patents.stackexchange.com", + "pets.meta": "pets.meta.stackexchange.com", + "pets": "pets.stackexchange.com", + "philosophy.meta": "philosophy.meta.stackexchange.com", + "philosophy": "philosophy.stackexchange.com", + "photo.meta": "photo.meta.stackexchange.com", + "photo": "photo.stackexchange.com", + "physics.meta": "physics.meta.stackexchange.com", + "physics": "physics.stackexchange.com", + "pm.meta": "pm.meta.stackexchange.com", + "pm": "pm.stackexchange.com", + "poker.meta": "poker.meta.stackexchange.com", + "poker": "poker.stackexchange.com", + "politics.meta": "politics.meta.stackexchange.com", + "politics": "politics.stackexchange.com", + "portuguese.meta": "portuguese.meta.stackexchange.com", + "portuguese": "portuguese.stackexchange.com", + "puzzling.meta": "puzzling.meta.stackexchange.com", + "puzzling": "puzzling.stackexchange.com", + "quant.meta": "quant.meta.stackexchange.com", + "quant": "quant.stackexchange.com", + "quantumcomputing.meta": "quantumcomputing.meta.stackexchange.com", + "quantumcomputing": "quantumcomputing.stackexchange.com", + "raspberrypi.meta": "raspberrypi.meta.stackexchange.com", + "raspberrypi": "raspberrypi.stackexchange.com", + "retrocomputing.meta": "retrocomputing.meta.stackexchange.com", + "retrocomputing": "retrocomputing.stackexchange.com", + "reverseengineering.meta": "reverseengineering.meta.stackexchange.com", + "reverseengineering": "reverseengineering.stackexchange.com", + "robotics.meta": "robotics.meta.stackexchange.com", + "robotics": 
"robotics.stackexchange.com", + "rpg.meta": "rpg.meta.stackexchange.com", + "rpg": "rpg.stackexchange.com", + "rus.meta": "rus.meta.stackexchange.com", + "rus": "rus.stackexchange.com", + "russian.meta": "russian.meta.stackexchange.com", + "russian": "russian.stackexchange.com", + "salesforce.meta": "salesforce.meta.stackexchange.com", + "salesforce": "salesforce.stackexchange.com", + "scicomp.meta": "scicomp.meta.stackexchange.com", + "scicomp": "scicomp.stackexchange.com", + "scifi.meta": "scifi.meta.stackexchange.com", + "scifi": "scifi.stackexchange.com", + "security.meta": "security.meta.stackexchange.com", + "security": "security.stackexchange.com", + "serverfault": "serverfault.com", + "sharepoint": "sharepoint.stackexchange.com", + "sharepoint.meta": "sharepoint.meta.stackexchange.com", + "sitecore": "sitecore.stackexchange.com", + "sitecore.meta": "sitecore.meta.stackexchange.com", + "skeptics": "skeptics.stackexchange.com", + "skeptics.meta": "skeptics.meta.stackexchange.com", + "softwareengineering": "softwareengineering.stackexchange.com", + "softwareengineering.meta": "softwareengineering.meta.stackexchange.com", + "softwarerecs": "softwarerecs.stackexchange.com", + "softwarerecs.meta": "softwarerecs.meta.stackexchange.com", + "sound": "sound.stackexchange.com", + "sound.meta": "sound.meta.stackexchange.com", + "space": "space.stackexchange.com", + "space.meta": "space.meta.stackexchange.com", + "spanish": "spanish.stackexchange.com", + "spanish.meta": "spanish.meta.stackexchange.com", + "sports": "sports.stackexchange.com", + "sports.meta": "sports.meta.stackexchange.com", + "sqa": "sqa.stackexchange.com", + "sqa.meta": "sqa.meta.stackexchange.com", + "stackapps": "stackapps.com", + # "stackexchange": "stackexchange.com", + "stats.meta": "stats.meta.stackexchange.com", + "stats": "stats.stackexchange.com", + "stellar.meta": "stellar.meta.stackexchange.com", + "stellar": "stellar.stackexchange.com", + "superuser": "superuser.com", + "sustainability": 
"sustainability.stackexchange.com", + "sustainability.meta": "sustainability.meta.stackexchange.com", + "tex": "tex.stackexchange.com", + "tex.meta": "tex.meta.stackexchange.com", + "tezos": "tezos.stackexchange.com", + "tezos.meta": "tezos.meta.stackexchange.com", + "tor": "tor.stackexchange.com", + "tor.meta": "tor.meta.stackexchange.com", + "travel": "travel.stackexchange.com", + "travel.meta": "travel.meta.stackexchange.com", + "tridion": "tridion.stackexchange.com", + "tridion.meta": "tridion.meta.stackexchange.com", + "ukrainian": "ukrainian.stackexchange.com", + "ukrainian.meta": "ukrainian.meta.stackexchange.com", + "unix": "unix.stackexchange.com", + "unix.meta": "unix.meta.stackexchange.com", + "ux": "ux.stackexchange.com", + "ux.meta": "ux.meta.stackexchange.com", + "vegetarianism": "vegetarianism.stackexchange.com", + "vegetarianism.meta": "vegetarianism.meta.stackexchange.com", + "vi": "vi.stackexchange.com", + "vi.meta": "vi.meta.stackexchange.com", + "webapps": "webapps.stackexchange.com", + "webapps.meta": "webapps.meta.stackexchange.com", + "webmasters": "webmasters.stackexchange.com", + "webmasters.meta": "webmasters.meta.stackexchange.com", + "windowsphone": "windowsphone.stackexchange.com", + "windowsphone.meta": "windowsphone.meta.stackexchange.com", + "woodworking": "woodworking.stackexchange.com", + "woodworking.meta": "woodworking.meta.stackexchange.com", + "wordpress": "wordpress.stackexchange.com", + "wordpress.meta": "wordpress.meta.stackexchange.com", + "workplace": "workplace.stackexchange.com", + "workplace.meta": "workplace.meta.stackexchange.com", + "worldbuilding": "worldbuilding.stackexchange.com", + "worldbuilding.meta": "worldbuilding.meta.stackexchange.com", + "writers": "writers.stackexchange.com", + "writers.meta": "writers.meta.stackexchange.com", + "stackoverflow": "stackoverflow.com", +} From 58dde1740f619530b0e2d36d66c26f5740ce4119 Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Wed, 20 Sep 2023 15:36:58 +0200 Subject: 
[PATCH 2/5] add code --- data_analysis/stackoverflow/README.md | 3 +++ data_analysis/stackoverflow/h4_code/README.md | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 data_analysis/stackoverflow/README.md diff --git a/data_analysis/stackoverflow/README.md b/data_analysis/stackoverflow/README.md new file mode 100644 index 0000000..089cc28 --- /dev/null +++ b/data_analysis/stackoverflow/README.md @@ -0,0 +1,3 @@ +## Code for processing StackExchange data + +Code for processing stackexchange data dump available in `h4_code` (to build https://huggingface.co/datasets/HuggingFaceH4/stack-exchange-preferences) and `other`, notebook for further processing (e.g convert all HTML to Markdown) in `StackExchangeProcessing.ipynb` (to build https://huggingface.co/datasets/lvwerra/stack-exchange-paired) diff --git a/data_analysis/stackoverflow/h4_code/README.md b/data_analysis/stackoverflow/h4_code/README.md index e918771..d3e3ed4 100644 --- a/data_analysis/stackoverflow/h4_code/README.md +++ b/data_analysis/stackoverflow/h4_code/README.md @@ -20,7 +20,7 @@ It is a long for-loop over desired exchanges. 
python scripts/data/pmp/stack_exchange_process.py --save_path=/path/to/hf-dataset ``` -3) `binarize.py`: used to binarize the pre-filter Stack Exchange data (and in the future, Reddit / Wikipedia) +3) `binarize.py`: used to binarize the pre-filter Stack Exchange data ```shell python scripts/data/pmp/binarize.py --save_path=/path/to/hf-dataset ``` \ No newline at end of file From 3ef2a868d7ccd6d9bb699c0e9a63a142f405eb2e Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Wed, 20 Sep 2023 15:38:27 +0200 Subject: [PATCH 3/5] add notebook --- .../StackExchangeProcessing.ipynb | 718 ++++++++++++++++++ 1 file changed, 718 insertions(+) create mode 100644 data_analysis/stackoverflow/StackExchangeProcessing.ipynb diff --git a/data_analysis/stackoverflow/StackExchangeProcessing.ipynb b/data_analysis/stackoverflow/StackExchangeProcessing.ipynb new file mode 100644 index 0000000..d37df68 --- /dev/null +++ b/data_analysis/stackoverflow/StackExchangeProcessing.ipynb @@ -0,0 +1,718 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 86, + "id": "7821c501-8c5d-4af6-81cd-caa6ad0bd58c", + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset, DatasetDict\n", + "from datasets import concatenate_datasets\n", + "from IPython.display import HTML\n", + "\n", + "from tqdm import tqdm\n", + "import re \n", + "import numpy as np\n", + "from markdownify import markdownify as md" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "dc821970-efdb-407f-bd79-59da09323280", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Found cached dataset parquet (/home/leandro/.cache/huggingface/datasets/HuggingFaceH4___parquet/HuggingFaceH4--stack-exchange-preferences-1d2bff9ecb5ffe2a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)\n" + ] + }, + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['qid', 'question', 'answers', 
'date', 'metadata'],\n", + " num_rows: 10807695\n", + "})" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds = load_dataset(\"HuggingFaceH4/stack-exchange-preferences\", split=\"train\", num_proc=16)\n", + "ds" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "0d8d8729-6d6b-4791-a24a-cb112c399bd0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

I have been wanting to learn about 3D printing a long time so I really want this site to succeed but I have no previous experience with the subject.

\n", + "\n", + "

I was wondering how can I help the site at this early stage. I thought about asking about how to get started with 3D printing but SE explicitly discourages \"easy\" questions in the private beta.

\n", + "\n", + "

What can newbies like me do for the site at this stage besides voting questions and answers?

\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "HTML(ds[0][\"question\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "b3b60caa-3bd9-4033-ab1c-90c5b08ef3ec", + "metadata": {}, + "outputs": [], + "source": [ + "def lang_callback(el):\n", + " lang = el['class'][0] if el.has_attr('class') else None\n", + " \n", + " if not lang is None:\n", + " lang = lang.split(\"-\")[-1]\n", + " return lang" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "de1123a0-7468-4d13-a8d3-4011ace36c3c", + "metadata": {}, + "outputs": [], + "source": [ + "def html2md(text):\n", + " text = md(text, code_language_callback=lang_callback)\n", + " text = re.sub(r\"\\n\\s*\\n\", \"\\n\\n\", text).strip()\n", + " return text.encode('utf-8', 'replace').decode()" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "c9da64a0-c753-4d35-9369-b70a7a9fa2f9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "I have been wanting to learn about 3D printing a long time so I really want this site to succeed but I have no previous experience with the subject. \n", + "\n", + "I was wondering how can I help the site at this early stage. 
I thought about asking about how to get started with 3D printing but SE explicitly discourages \"easy\" questions in the private beta.\n", + "\n", + "What can newbies like me do for the site at this stage besides voting questions and answers?\n", + "====================\n" + ] + } + ], + "source": [ + "for i in range(1):\n", + " text = html2md(ds[i][\"question\"])\n", + " print(text)\n", + " print(\"==\"*10)" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "3bf33a2f-fed5-49e7-8046-e813ad172b17", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "49.935" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.mean([len(ds[i][\"answers\"])*(len(ds[i][\"answers\"])-1)/2 for i in range(10000)])" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "88ea2dd5-b885-4f65-bae3-1319c7816044", + "metadata": {}, + "outputs": [], + "source": [ + "ds = ds.shuffle(seed=42)\n", + "index = list(range(len(ds)))\n", + "\n", + "ds_splits = DatasetDict({\n", + " \"finetune\": ds.select(index[:3_000_000]),\n", + " \"reward\": ds.select(index[3_000_000:6_000_000]),\n", + " \"rl\": ds.select(index[6_000_000:9_000_000]),\n", + " \"evaluation\": ds.select(index[9_000_000:]),\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "id": "1607922d-f585-4de7-be70-2205b5170102", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatasetDict({\n", + " finetune: Dataset({\n", + " features: ['qid', 'question', 'answers', 'date', 'metadata'],\n", + " num_rows: 3000000\n", + " })\n", + " reward: Dataset({\n", + " features: ['qid', 'question', 'answers', 'date', 'metadata'],\n", + " num_rows: 3000000\n", + " })\n", + " rl: Dataset({\n", + " features: ['qid', 'question', 'answers', 'date', 'metadata'],\n", + " num_rows: 3000000\n", + " })\n", + " evaluation: Dataset({\n", + " features: ['qid', 'question', 'answers', 'date', 'metadata'],\n", + " 
num_rows: 1807695\n", + " })\n", + "})" + ] + }, + "execution_count": 88, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds_splits" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "edc8af18-94a5-49e9-ae73-ce4ba81d9739", + "metadata": {}, + "outputs": [], + "source": [ + "def binary_comparison(answers):\n", + " \"\"\"Returns tuples of answers, first always best\"\"\"\n", + " pairs = []\n", + " \n", + " for i in range(len(answers)-1):\n", + " for j in range(i+1, len(answers)):\n", + " if answers[i][\"pm_score\"]>answers[j][\"pm_score\"]:\n", + " pairs.append((answers[i][\"text\"], answers[j][\"text\"]))\n", + " elif answers[i][\"pm_score\"] MAX_PAIRS_PER_QUESTION:\n", + " indices = np.random.choice(list(range(len(pairs))), MAX_PAIRS_PER_QUESTION, replace=False)\n", + " pairs = [pairs[i] for i in indices]\n", + " \n", + " # construct the samples\n", + " for pair in pairs:\n", + " for key in examples:\n", + " if key==\"question\":\n", + " new_examples[key].append(html2md(examples[key][sample_id]))\n", + " else:\n", + " new_examples[key].append(examples[key][sample_id])\n", + " new_examples[\"response_j\"].append(html2md(pair[0]))\n", + " new_examples[\"response_k\"].append(html2md(pair[1]))\n", + " return new_examples" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "id": "ac06aac5-3953-4321-9f1e-6ff210bee82d", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Map (num_proc=60): 0%| | 0/3000000 [00:00\\n\\nNow it says this\\n\\n```\\nCallback for successful upload requests.\\n$('#fileupload')\\n .bind('fileuploaddone', function (e, data) {/* ... */})\\n\\n```\\n\\nNow I have defined this custom function for testing in my own js file\\n\\n```\\n$('#fileupload').bind('fileuploaddone', function (e, data) {/* ... 
*/\\nalert('Hello');\\n})\\n\\n```\\n\\nBut it's not working.\\n\\nBut if I edit the main file in here\\n\\n```\\n // Callback for successful uploads:\\n done: function (e, data) {\\n\\n```\\n\\nThen it works.\",\n", + " 'answers': [{'answer_id': 12891484,\n", + " 'author': 'Reflective',\n", + " 'author_id': 1686626,\n", + " 'author_profile': 'https://Stackoverflow.com/users/1686626',\n", + " 'pm_score': 4,\n", + " 'selected': True,\n", + " 'text': \"

Looking at the library code, seems all events are renamed removing 'fileupload' ... so 'fileuploaddone' becomes just 'done'. It is valid for all other callbacks.\\nlook at this section:

\\n\\n
    // Other callbacks:\\n    // Callback for the submit event of each file upload:\\n    // submit: function (e, data) {}, // .bind('fileuploadsubmit', func);\\n    // Callback for the start of each file upload request:\\n    // send: function (e, data) {}, // .bind('fileuploadsend', func);\\n    // Callback for successful uploads:\\n    // done: function (e, data) {}, // .bind('fileuploaddone', func);\\n    // Callback for failed (abort or error) uploads:\\n    // fail: function (e, data) {}, // .bind('fileuploadfail', func);\\n    // Callback for completed (success, abort or error) requests:\\n    // always: function (e, data) {}, // .bind('fileuploadalways', func);\\n    // Callback for upload progress events:\\n    // progress: function (e, data) {}, // .bind('fileuploadprogress', func);\\n    // Callback for global upload progress events:\\n    // progressall: function (e, data) {}, // .bind('fileuploadprogressall', func);\\n    // Callback for uploads start, equivalent to the global ajaxStart event:\\n    // start: function (e) {}, // .bind('fileuploadstart', func);\\n    // Callback for uploads stop, equivalent to the global ajaxStop event:\\n    // stop: function (e) {}, // .bind('fileuploadstop', func);\\n    // Callback for change events of the fileInput(s):\\n    // change: function (e, data) {}, // .bind('fileuploadchange', func);\\n    // Callback for paste events to the pasteZone(s):\\n    // paste: function (e, data) {}, // .bind('fileuploadpaste', func);\\n    // Callback for drop events of the dropZone(s):\\n    // drop: function (e, data) {}, // .bind('fileuploaddrop', func);\\n    // Callback for dragover events of the dropZone(s):\\n    // dragover: function (e) {}, // .bind('fileuploaddragover', func);\\n
\\n\\n

If you have some doubts about what's happening, just look at the code inside. This library is not compressed so it is easy to see. for example

\\n\\n
// start: function (e) {}, // .bind('fileuploadstart', func);\\n
\\n\\n

start callback is implemented. fileuploadstart is not.

\\n\"},\n", + " {'answer_id': 15419140,\n", + " 'author': 'NXT',\n", + " 'author_id': 1554649,\n", + " 'author_profile': 'https://Stackoverflow.com/users/1554649',\n", + " 'pm_score': 3,\n", + " 'selected': False,\n", + " 'text': '

Check if the server-side uploading script returns a JSON reply - in my case it didn\\'t work when the reply was empty, but file was uploaded successfully.

\\n\\n

So, below is working for me with jQuery 1.9.1 and the newest version of the \"jQuery File Upload Plugin\" - 5.21.3

\\n\\n
$(\"#fileupload\").bind(\"fileuploaddone\", function (e, data) {\\n    console.log(\"fileuploaddone event fired\");\\n});\\n
\\n'}],\n", + " 'date': '2012/10/15',\n", + " 'metadata': ['https://Stackoverflow.com/questions/12891264',\n", + " 'https://Stackoverflow.com',\n", + " 'https://Stackoverflow.com/users/767244/'],\n", + " 'response_j': \"Looking at the library code, seems all events are renamed removing 'fileupload' ... so 'fileuploaddone' becomes just 'done'. It is valid for all other callbacks.\\nlook at this section:\\n\\n```\\n // Other callbacks:\\n // Callback for the submit event of each file upload:\\n // submit: function (e, data) {}, // .bind('fileuploadsubmit', func);\\n // Callback for the start of each file upload request:\\n // send: function (e, data) {}, // .bind('fileuploadsend', func);\\n // Callback for successful uploads:\\n // done: function (e, data) {}, // .bind('fileuploaddone', func);\\n // Callback for failed (abort or error) uploads:\\n // fail: function (e, data) {}, // .bind('fileuploadfail', func);\\n // Callback for completed (success, abort or error) requests:\\n // always: function (e, data) {}, // .bind('fileuploadalways', func);\\n // Callback for upload progress events:\\n // progress: function (e, data) {}, // .bind('fileuploadprogress', func);\\n // Callback for global upload progress events:\\n // progressall: function (e, data) {}, // .bind('fileuploadprogressall', func);\\n // Callback for uploads start, equivalent to the global ajaxStart event:\\n // start: function (e) {}, // .bind('fileuploadstart', func);\\n // Callback for uploads stop, equivalent to the global ajaxStop event:\\n // stop: function (e) {}, // .bind('fileuploadstop', func);\\n // Callback for change events of the fileInput(s):\\n // change: function (e, data) {}, // .bind('fileuploadchange', func);\\n // Callback for paste events to the pasteZone(s):\\n // paste: function (e, data) {}, // .bind('fileuploadpaste', func);\\n // Callback for drop events of the dropZone(s):\\n // drop: function (e, data) {}, // .bind('fileuploaddrop', func);\\n // Callback for dragover events 
of the dropZone(s):\\n // dragover: function (e) {}, // .bind('fileuploaddragover', func);\\n\\n```\\n\\nIf you have some doubts about what's happening, just look at the code inside. This library is not compressed so it is easy to see. for example\\n\\n```\\n// start: function (e) {}, // .bind('fileuploadstart', func);\\n\\n```\\n\\n`start` callback is implemented. `fileuploadstart` is not.\",\n", + " 'response_k': 'Check if the server-side uploading script returns a JSON reply - in my case it didn\\'t work when the reply was empty, but file was uploaded successfully.\\n\\nSo, below is working for me with jQuery 1.9.1 and the newest version of the \"jQuery File Upload Plugin\" - 5.21.3\\n\\n```\\n$(\"#fileupload\").bind(\"fileuploaddone\", function (e, data) {\\n console.log(\"fileuploaddone event fired\");\\n});\\n\\n```'}" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds_result[\"finetune\"][0]" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "id": "2c96653b-7a5a-4cae-a327-b6aa77aa5850", + "metadata": {}, + "outputs": [], + "source": [ + "ds_result = ds_result.remove_columns([\"answers\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "id": "15c2e5ee-7c7d-4e98-9e63-e5d37a9354aa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatasetDict({\n", + " finetune: Dataset({\n", + " features: ['qid', 'question', 'date', 'metadata', 'response_j', 'response_k'],\n", + " num_rows: 7440923\n", + " })\n", + " reward: Dataset({\n", + " features: ['qid', 'question', 'date', 'metadata', 'response_j', 'response_k'],\n", + " num_rows: 7441998\n", + " })\n", + " rl: Dataset({\n", + " features: ['qid', 'question', 'date', 'metadata', 'response_j', 'response_k'],\n", + " num_rows: 7435908\n", + " })\n", + " evaluation: Dataset({\n", + " features: ['qid', 'question', 'date', 'metadata', 'response_j', 'response_k'],\n", + " num_rows: 4483004\n", + " })\n", + "})" + 
] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds_result" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "id": "4d42b35c-5252-4b49-ba4b-20818bc9e086", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "finetune\n", + "reward\n", + "rl\n", + "evaluation\n" + ] + } + ], + "source": [ + "for key in ds_result:\n", + " print(key)" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "id": "e32c11d7-a88e-4d92-9dfc-92b2a67c5455", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import time\n", + "from multiprocessing import Pool\n", + "from tqdm import tqdm\n", + "\n", + "from huggingface_hub import Repository\n", + "\n", + "\n", + "def save_shard(shard_tuple):\n", + " \"\"\"Save shard\"\"\"\n", + " filename, shard = shard_tuple\n", + " # use to_json instead to save as json file\n", + " shard.to_parquet(filename)\n", + "\n", + "\n", + "def save_manual_shards(ds, user=\"lvwerra\", remote_dataset_repo=\"stack-exchange-paired\", subfolder=\"train\"):\n", + " \"\"\"Save sharded data\n", + " Args:\n", + " ds (Dataset): dataset to be saved\n", + " user (str): user name\n", + " remote_dataset_repo (str): remote dataset repository\n", + " out_path (str): path to save the shards\"\"\"\n", + " # this will create a folder OUT_PATH that is a clone of REMOTE_DATASET_REPO\n", + " # you can save the shards inside it and do git add/commit/push to push data to the hub\n", + " out_path = remote_dataset_repo\n", + " # if out path doesnt already exist\n", + " if not os.path.exists(out_path):\n", + " repo = Repository(\n", + " local_dir=out_path,\n", + " clone_from=user + \"/\" + remote_dataset_repo,\n", + " repo_type=\"dataset\",\n", + " private=False,\n", + " use_auth_token=True,\n", + " git_user=user,\n", + " )\n", + "\n", + " # files will be numerous we save them in a folder called data inside out_path\n", + " if not 
os.path.exists(out_path):\n", + " os.mkdir(out_path + \"/data\")\n", + " os.mkdir(out_path + f\"/data/{subfolder}\")\n", + " \n", + " SHARD_SIZE = 1000 << 20\n", + " if ds._indices is not None:\n", + " dataset_nbytes = ds.data.nbytes * len(ds._indices) / len(ds.data)\n", + " else:\n", + " dataset_nbytes = ds.data.nbytes\n", + " num_shards = int(dataset_nbytes / SHARD_SIZE) + 1\n", + " print(f\"Number of shards: {num_shards}\")\n", + "\n", + " print(\"sharding the dataset\")\n", + " t_start = time.time()\n", + " shards = (\n", + " ds.shard(num_shards=num_shards, index=i, contiguous=True)\n", + " for i in range(num_shards)\n", + " )\n", + " # use f\"{OUT_PATH}/data/train-{index:05d}-of-{num_shards:05d}.json\" instead for json files\n", + " filenames = (\n", + " f\"{out_path}/data/{subfolder}/train-{index:05d}-of-{num_shards:05d}.parquet\"\n", + " for index in range(num_shards)\n", + " )\n", + "\n", + " with Pool(16) as p:\n", + " list(\n", + " tqdm(\n", + " p.imap_unordered(save_shard, zip(filenames, shards), chunksize=4),\n", + " total=num_shards,\n", + " )\n", + " )\n", + " print(f\"Time to save dataset: {time.time()-t_start:.2f}\")\n", + " # to push dataset to hub do: git add/commit/push inside OUT_PATH" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "id": "a90664eb-5c54-4fae-9a8a-d509bb2abdfe", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of shards: 20\n", + "sharding the dataset\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:28<00:00, 1.43s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time to save dataset: 29.15\n", + "Number of shards: 20\n", + "sharding the dataset\n" + ] + }, + { + "name": "stderr", + 
"output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:22<00:00, 1.15s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time to save dataset: 23.42\n", + "Number of shards: 20\n", + "sharding the dataset\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:10<00:00, 1.83it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time to save dataset: 11.36\n", + "Number of shards: 12\n", + "sharding the dataset\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:10<00:00, 1.12it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time to save dataset: 11.13\n" + ] + } + ], + "source": [ + "for key in ds_result:\n", + " save_manual_shards(ds_result[key], subfolder=key)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d62f5a7f-2a23-4e0d-9e49-b29f88ea8c13", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + } + }, + "nbformat": 4, 
+ "nbformat_minor": 5 +} From e670afadf029d54192df72e6534ecb0aad4610aa Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Thu, 21 Sep 2023 13:54:59 +0200 Subject: [PATCH 4/5] add notebook --- .../pull-requests/reconstruct_prs.ipynb | 5043 +++++++++++++++++ 1 file changed, 5043 insertions(+) create mode 100644 data_analysis/pull-requests/reconstruct_prs.ipynb diff --git a/data_analysis/pull-requests/reconstruct_prs.ipynb b/data_analysis/pull-requests/reconstruct_prs.ipynb new file mode 100644 index 0000000..9bc724c --- /dev/null +++ b/data_analysis/pull-requests/reconstruct_prs.ipynb @@ -0,0 +1,5043 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: python-dateutil in /Users/loubnabenallal/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages (2.8.2)\n", + "Requirement already satisfied: six>=1.5 in /Users/loubnabenallal/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages (from python-dateutil) (1.16.0)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "pip install python-dateutil" + ] + }, + { + "cell_type": "code", + "execution_count": 329, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading readme: 100%|██████████| 8.02k/8.02k [00:00<00:00, 1.52MB/s]\n" + ] + } + ], + "source": [ + "import json\n", + "import pandas as pd\n", + "from 
dateutil.parser import parse\n", + "from datasets import load_dataset, Dataset\n", + "\n", + "small_ds = load_dataset(\"bigcode-data/the-stack-gh-pull-requests\", use_auth_token=True, split=\"train\", streaming=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 330, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import pandas as pd\n", + "from dateutil.parser import parse\n", + "from datasets import load_dataset, Dataset\n", + "\n", + "small_ds = load_dataset(\"bigcode-data/the-stack-gh-pull-requests\", use_auth_token=True, split=\"train\", streaming=True)\n", + "\n", + "size = 500_000\n", + "\n", + "ds = small_ds.shuffle(seed=0, buffer_size=1_000_000)\n", + "\n", + "# 10k subset of random samples from ds\n", + "fianl_ds = list(ds.take(size))\n", + "ds = Dataset.from_pandas(pd.DataFrame(data=fianl_ds))" + ] + }, + { + "cell_type": "code", + "execution_count": 332, + "metadata": {}, + "outputs": [], + "source": [ + "ds = Dataset.from_pandas(pd.DataFrame(data=fianl_ds))" + ] + }, + { + "cell_type": "code", + "execution_count": 365, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['pull_request.guid', 'pull_request.code_review_events', 'pull_request.events', 'pull_request.issue_events', 'bucket', '__index_level_0__'],\n", + " num_rows: 500000\n", + "})" + ] + }, + "execution_count": 365, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import pandas as pd\n", + "from dateutil.parser import parse\n", + "from datasets import load_dataset, Dataset\n", + "\n", + "small_ds = load_dataset(\"bigcode-data/the-stack-gh-pull-requests\", use_auth_token=True, split=\"train\", streaming=True)\n", + "\n", + "size = 500_000\n", + "\n", + "ds = small_ds.shuffle(seed=0, buffer_size=1_000_000)\n", + "\n", + "# 10k subset of random 
samples from ds\n", + "fianl_ds = list(ds.take(size))\n", + "ds = Dataset.from_pandas(pd.DataFrame(data=fianl_ds))" + ] + }, + { + "cell_type": "code", + "execution_count": 335, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'__index_level_0__': 46164,\n", + " 'bucket': None,\n", + " 'pull_request.code_review_events': None,\n", + " 'pull_request.events': '[{\"type\": \"PullRequestEvent\", \"action\": \"opened\", '\n", + " '\"actor.login\": \"pull[bot]\", \"actor.id\": 39814207, '\n", + " '\"user.login\": null, \"user.id\": null, \"user.type\": '\n", + " 'null, \"repo.name\": \"kofj/website\", \"repo.id\": '\n", + " '158894695, \"public\": true, \"created_at\": '\n", + " '\"2020-11-23T05:58:40Z\", \"org.id\": null, \"org.login\": '\n", + " 'null, \"pull_request.id\": 525472638, '\n", + " '\"pull_request.number\": 2460, \"pull_request.state\": '\n", + " '\"open\", \"pull_request.title\": \"[pull] master from '\n", + " 'kubernetes:master\", \"pull_request.body\": \"See Commits '\n", + " 'and Changes for more details.\\\\n\\\\n-----\\\\nCreated by '\n", + " '[ '\n", + " '**pull[bot]**](https://github.com/wei/pull)\\\\n\\\\n_Can '\n", + " 'you help keep this open source service alive? 
'\n", + " '**[\\\\ud83d\\\\udc96 Please sponsor : '\n", + " ')](https://prod.download/pull-pr-sponsor)**_\", '\n", + " '\"pull_request.user.login\": \"pull[bot]\", '\n", + " '\"pull_request.user.id\": 39814207, '\n", + " '\"pull_request.author_association\": \"NONE\", '\n", + " '\"pull_request.created_at\": \"2020-11-23T05:58:39Z\", '\n", + " '\"pull_request.updated_at\": \"2020-11-23T05:58:39Z\", '\n", + " '\"pull_request.closed_at\": null, '\n", + " '\"pull_request.merged_at\": null, '\n", + " '\"pull_request.merge_commit_sha\": null, '\n", + " '\"pull_request.locked\": false, '\n", + " '\"pull_request.assignee.login\": null, '\n", + " '\"pull_request.assignee.id\": null, '\n", + " '\"pull_request.assignee.type\": null, '\n", + " '\"pull_request.assignee.site_admin\": null, '\n", + " '\"pull_request.milestone.id\": null, '\n", + " '\"pull_request.milestone.number\": null, '\n", + " '\"pull_request.milestone.title\": null, '\n", + " '\"pull_request.milestone.description\": null, '\n", + " '\"pull_request.milestone.creator.login\": null, '\n", + " '\"pull_request.milestone.creator.id\": null, '\n", + " '\"pull_request.milestone.creator.type\": null, '\n", + " '\"pull_request.milestone.creator.site_admin\": null, '\n", + " '\"pull_request.milestone.open_issues\": null, '\n", + " '\"pull_request.milestone.closed_issues\": null, '\n", + " '\"pull_request.milestone.state\": null, '\n", + " '\"pull_request.milestone.created_at\": null, '\n", + " '\"pull_request.milestone.updated_at\": null, '\n", + " '\"pull_request.milestone.due_on\": null, '\n", + " '\"pull_request.milestone.closed_at\": null, '\n", + " '\"pull_request.merged\": false, '\n", + " '\"pull_request.mergeable\": null, '\n", + " '\"pull_request.mergeable_state\": \"unknown\", '\n", + " '\"pull_request.merged_by.login\": null, '\n", + " '\"pull_request.merged_by.id\": null, '\n", + " '\"pull_request.merged_by.type\": null, '\n", + " '\"pull_request.merged_by.site_admin\": null, '\n", + " 
'\"pull_request.comments\": 0, '\n", + " '\"pull_request.review_comments\": 0, '\n", + " '\"pull_request.commits\": 4, \"pull_request.additions\": '\n", + " '243, \"pull_request.deletions\": 0, '\n", + " '\"pull_request.changed_files\": 2, '\n", + " '\"pull_request.label.id\": null, '\n", + " '\"pull_request.label.name\": null, '\n", + " '\"pull_request.label.color\": null, '\n", + " '\"pull_request.label.default\": null, '\n", + " '\"pull_request.head.label\": \"kubernetes:master\", '\n", + " '\"pull_request.head.ref\": \"master\", '\n", + " '\"pull_request.head.sha\": '\n", + " '\"fd65678baa464abe7715dbf6df44284506c858a2\", '\n", + " '\"pull_request.head.user.login\": \"kubernetes\", '\n", + " '\"pull_request.head.user.type\": \"Organization\", '\n", + " '\"pull_request.head.repo.name\": \"website\", '\n", + " '\"pull_request.head.repo.full_name\": '\n", + " '\"kubernetes/website\", '\n", + " '\"pull_request.head.repo.owner.login\": \"kubernetes\", '\n", + " '\"pull_request.head.repo.owner.type\": \"Organization\", '\n", + " '\"pull_request.head.repo.private\": false, '\n", + " '\"pull_request.head.repo.homepage\": '\n", + " '\"https://kubernetes.io\", '\n", + " '\"pull_request.head.repo.description\": \"Kubernetes '\n", + " 'website and documentation repo: \", '\n", + " '\"pull_request.head.repo.fork\": false, '\n", + " '\"pull_request.head.repo.created_at\": '\n", + " '\"2016-02-10T22:46:48Z\", '\n", + " '\"pull_request.head.repo.updated_at\": '\n", + " '\"2020-11-23T02:09:41Z\", '\n", + " '\"pull_request.head.repo.pushed_at\": '\n", + " '\"2020-11-23T05:12:37Z\", '\n", + " '\"pull_request.head.repo.size\": 319781, '\n", + " '\"pull_request.head.repo.stargazers_count\": 2267, '\n", + " '\"pull_request.head.repo.watchers_count\": 2267, '\n", + " '\"pull_request.head.repo.language\": \"HTML\", '\n", + " '\"pull_request.head.repo.has_issues\": true, '\n", + " '\"pull_request.head.repo.has_projects\": true, '\n", + " '\"pull_request.head.repo.has_downloads\": true, 
'\n", + " '\"pull_request.head.repo.has_wiki\": true, '\n", + " '\"pull_request.head.repo.has_pages\": false, '\n", + " '\"pull_request.head.repo.forks_count\": 8508, '\n", + " '\"pull_request.head.repo.archived\": false, '\n", + " '\"pull_request.head.repo.disabled\": false, '\n", + " '\"pull_request.head.repo.open_issues_count\": 641, '\n", + " '\"pull_request.head.repo.forks\": 8508, '\n", + " '\"pull_request.head.repo.open_issues\": 641, '\n", + " '\"pull_request.head.repo.watchers\": 2267, '\n", + " '\"pull_request.head.repo.default_branch\": \"master\", '\n", + " '\"pull_request.head.repo.license.key\": \"cc-by-4.0\", '\n", + " '\"pull_request.head.repo.license.spdx_id\": '\n", + " '\"CC-BY-4.0\", \"pull_request.head.repo.license.name\": '\n", + " '\"Creative Commons Attribution 4.0 International\", '\n", + " '\"pull_request.base.label\": \"kofj:master\", '\n", + " '\"pull_request.base.ref\": \"master\", '\n", + " '\"pull_request.base.sha\": '\n", + " '\"97a882c38db18684471447d06dd15c984302e0a7\", '\n", + " '\"pull_request.base.user.login\": \"kofj\", '\n", + " '\"pull_request.base.user.type\": \"User\", '\n", + " '\"pull_request.base.repo.name\": \"website\", '\n", + " '\"pull_request.base.repo.full_name\": \"kofj/website\", '\n", + " '\"pull_request.base.repo.owner.login\": \"kofj\", '\n", + " '\"pull_request.base.repo.owner.type\": \"User\", '\n", + " '\"pull_request.base.repo.private\": false, '\n", + " '\"pull_request.base.repo.homepage\": '\n", + " '\"https://kubernetes.io\", '\n", + " '\"pull_request.base.repo.description\": \"Kubernetes '\n", + " 'website and documentation repo: \", '\n", + " '\"pull_request.base.repo.fork\": true, '\n", + " '\"pull_request.base.repo.created_at\": '\n", + " '\"2018-11-24T02:12:25Z\", '\n", + " '\"pull_request.base.repo.updated_at\": '\n", + " '\"2020-11-23T01:58:46Z\", '\n", + " '\"pull_request.base.repo.pushed_at\": '\n", + " '\"2020-11-23T01:58:43Z\", '\n", + " '\"pull_request.base.repo.size\": 286251, '\n", + " 
'\"pull_request.base.repo.stargazers_count\": 0, '\n", + " '\"pull_request.base.repo.watchers_count\": 0, '\n", + " '\"pull_request.base.repo.language\": \"HTML\", '\n", + " '\"pull_request.base.repo.has_issues\": false, '\n", + " '\"pull_request.base.repo.has_projects\": true, '\n", + " '\"pull_request.base.repo.has_downloads\": true, '\n", + " '\"pull_request.base.repo.has_wiki\": true, '\n", + " '\"pull_request.base.repo.has_pages\": false, '\n", + " '\"pull_request.base.repo.forks_count\": 0, '\n", + " '\"pull_request.base.repo.archived\": false, '\n", + " '\"pull_request.base.repo.disabled\": false, '\n", + " '\"pull_request.base.repo.open_issues_count\": 1, '\n", + " '\"pull_request.base.repo.forks\": 0, '\n", + " '\"pull_request.base.repo.open_issues\": 1, '\n", + " '\"pull_request.base.repo.watchers\": 0, '\n", + " '\"pull_request.base.repo.default_branch\": \"master\", '\n", + " '\"pull_request.base.repo.license.key\": \"cc-by-4.0\", '\n", + " '\"pull_request.base.repo.license.spdx_id\": '\n", + " '\"CC-BY-4.0\", \"pull_request.base.repo.license.name\": '\n", + " '\"Creative Commons Attribution 4.0 International\", '\n", + " '\"pull_request.guid\": \"kofj/website/pull/2460\"}, '\n", + " '{\"type\": \"PullRequestEvent\", \"action\": \"closed\", '\n", + " '\"actor.login\": \"pull[bot]\", \"actor.id\": 39814207, '\n", + " '\"user.login\": null, \"user.id\": null, \"user.type\": '\n", + " 'null, \"repo.name\": \"kofj/website\", \"repo.id\": '\n", + " '158894695, \"public\": true, \"created_at\": '\n", + " '\"2020-11-23T05:58:50Z\", \"org.id\": null, \"org.login\": '\n", + " 'null, \"pull_request.id\": 525472638, '\n", + " '\"pull_request.number\": 2460, \"pull_request.state\": '\n", + " '\"closed\", \"pull_request.title\": \"[pull] master from '\n", + " 'kubernetes:master\", \"pull_request.body\": \"See '\n", + " '[Commits](/kofj/website/pull/2460/commits) and '\n", + " '[Changes](/kofj/website/pull/2460/files) for more '\n", + " 
'details.\\\\n\\\\n-----\\\\nCreated by [ '\n", + " '**pull[bot]**](https://github.com/wei/pull)\\\\n\\\\n_Can '\n", + " 'you help keep this open source service alive? '\n", + " '**[\\\\ud83d\\\\udc96 Please sponsor : '\n", + " ')](https://prod.download/pull-pr-sponsor)**_\", '\n", + " '\"pull_request.user.login\": \"pull[bot]\", '\n", + " '\"pull_request.user.id\": 39814207, '\n", + " '\"pull_request.author_association\": \"NONE\", '\n", + " '\"pull_request.created_at\": \"2020-11-23T05:58:39Z\", '\n", + " '\"pull_request.updated_at\": \"2020-11-23T05:58:50Z\", '\n", + " '\"pull_request.closed_at\": \"2020-11-23T05:58:50Z\", '\n", + " '\"pull_request.merged_at\": \"2020-11-23T05:58:49Z\", '\n", + " '\"pull_request.merge_commit_sha\": '\n", + " '\"fd65678baa464abe7715dbf6df44284506c858a2\", '\n", + " '\"pull_request.locked\": false, '\n", + " '\"pull_request.assignee.login\": null, '\n", + " '\"pull_request.assignee.id\": null, '\n", + " '\"pull_request.assignee.type\": null, '\n", + " '\"pull_request.assignee.site_admin\": null, '\n", + " '\"pull_request.milestone.id\": null, '\n", + " '\"pull_request.milestone.number\": null, '\n", + " '\"pull_request.milestone.title\": null, '\n", + " '\"pull_request.milestone.description\": null, '\n", + " '\"pull_request.milestone.creator.login\": null, '\n", + " '\"pull_request.milestone.creator.id\": null, '\n", + " '\"pull_request.milestone.creator.type\": null, '\n", + " '\"pull_request.milestone.creator.site_admin\": null, '\n", + " '\"pull_request.milestone.open_issues\": null, '\n", + " '\"pull_request.milestone.closed_issues\": null, '\n", + " '\"pull_request.milestone.state\": null, '\n", + " '\"pull_request.milestone.created_at\": null, '\n", + " '\"pull_request.milestone.updated_at\": null, '\n", + " '\"pull_request.milestone.due_on\": null, '\n", + " '\"pull_request.milestone.closed_at\": null, '\n", + " '\"pull_request.merged\": true, '\n", + " '\"pull_request.mergeable\": null, '\n", + " 
'\"pull_request.mergeable_state\": \"unknown\", '\n", + " '\"pull_request.merged_by.login\": \"pull[bot]\", '\n", + " '\"pull_request.merged_by.id\": 39814207, '\n", + " '\"pull_request.merged_by.type\": \"Bot\", '\n", + " '\"pull_request.merged_by.site_admin\": false, '\n", + " '\"pull_request.comments\": 0, '\n", + " '\"pull_request.review_comments\": 0, '\n", + " '\"pull_request.commits\": 4, \"pull_request.additions\": '\n", + " '243, \"pull_request.deletions\": 0, '\n", + " '\"pull_request.changed_files\": 2, '\n", + " '\"pull_request.label.id\": null, '\n", + " '\"pull_request.label.name\": null, '\n", + " '\"pull_request.label.color\": null, '\n", + " '\"pull_request.label.default\": null, '\n", + " '\"pull_request.head.label\": \"kubernetes:master\", '\n", + " '\"pull_request.head.ref\": \"master\", '\n", + " '\"pull_request.head.sha\": '\n", + " '\"fd65678baa464abe7715dbf6df44284506c858a2\", '\n", + " '\"pull_request.head.user.login\": \"kubernetes\", '\n", + " '\"pull_request.head.user.type\": \"Organization\", '\n", + " '\"pull_request.head.repo.name\": \"website\", '\n", + " '\"pull_request.head.repo.full_name\": '\n", + " '\"kubernetes/website\", '\n", + " '\"pull_request.head.repo.owner.login\": \"kubernetes\", '\n", + " '\"pull_request.head.repo.owner.type\": \"Organization\", '\n", + " '\"pull_request.head.repo.private\": false, '\n", + " '\"pull_request.head.repo.homepage\": '\n", + " '\"https://kubernetes.io\", '\n", + " '\"pull_request.head.repo.description\": \"Kubernetes '\n", + " 'website and documentation repo: \", '\n", + " '\"pull_request.head.repo.fork\": false, '\n", + " '\"pull_request.head.repo.created_at\": '\n", + " '\"2016-02-10T22:46:48Z\", '\n", + " '\"pull_request.head.repo.updated_at\": '\n", + " '\"2020-11-23T02:09:41Z\", '\n", + " '\"pull_request.head.repo.pushed_at\": '\n", + " '\"2020-11-23T05:12:37Z\", '\n", + " '\"pull_request.head.repo.size\": 319781, '\n", + " '\"pull_request.head.repo.stargazers_count\": 2267, '\n", + " 
'\"pull_request.head.repo.watchers_count\": 2267, '\n", + " '\"pull_request.head.repo.language\": \"HTML\", '\n", + " '\"pull_request.head.repo.has_issues\": true, '\n", + " '\"pull_request.head.repo.has_projects\": true, '\n", + " '\"pull_request.head.repo.has_downloads\": true, '\n", + " '\"pull_request.head.repo.has_wiki\": true, '\n", + " '\"pull_request.head.repo.has_pages\": false, '\n", + " '\"pull_request.head.repo.forks_count\": 8508, '\n", + " '\"pull_request.head.repo.archived\": false, '\n", + " '\"pull_request.head.repo.disabled\": false, '\n", + " '\"pull_request.head.repo.open_issues_count\": 641, '\n", + " '\"pull_request.head.repo.forks\": 8508, '\n", + " '\"pull_request.head.repo.open_issues\": 641, '\n", + " '\"pull_request.head.repo.watchers\": 2267, '\n", + " '\"pull_request.head.repo.default_branch\": \"master\", '\n", + " '\"pull_request.head.repo.license.key\": \"cc-by-4.0\", '\n", + " '\"pull_request.head.repo.license.spdx_id\": '\n", + " '\"CC-BY-4.0\", \"pull_request.head.repo.license.name\": '\n", + " '\"Creative Commons Attribution 4.0 International\", '\n", + " '\"pull_request.base.label\": \"kofj:master\", '\n", + " '\"pull_request.base.ref\": \"master\", '\n", + " '\"pull_request.base.sha\": '\n", + " '\"97a882c38db18684471447d06dd15c984302e0a7\", '\n", + " '\"pull_request.base.user.login\": \"kofj\", '\n", + " '\"pull_request.base.user.type\": \"User\", '\n", + " '\"pull_request.base.repo.name\": \"website\", '\n", + " '\"pull_request.base.repo.full_name\": \"kofj/website\", '\n", + " '\"pull_request.base.repo.owner.login\": \"kofj\", '\n", + " '\"pull_request.base.repo.owner.type\": \"User\", '\n", + " '\"pull_request.base.repo.private\": false, '\n", + " '\"pull_request.base.repo.homepage\": '\n", + " '\"https://kubernetes.io\", '\n", + " '\"pull_request.base.repo.description\": \"Kubernetes '\n", + " 'website and documentation repo: \", '\n", + " '\"pull_request.base.repo.fork\": true, '\n", + " 
'\"pull_request.base.repo.created_at\": '\n", + " '\"2018-11-24T02:12:25Z\", '\n", + " '\"pull_request.base.repo.updated_at\": '\n", + " '\"2020-11-23T01:58:46Z\", '\n", + " '\"pull_request.base.repo.pushed_at\": '\n", + " '\"2020-11-23T05:58:46Z\", '\n", + " '\"pull_request.base.repo.size\": 286251, '\n", + " '\"pull_request.base.repo.stargazers_count\": 0, '\n", + " '\"pull_request.base.repo.watchers_count\": 0, '\n", + " '\"pull_request.base.repo.language\": \"HTML\", '\n", + " '\"pull_request.base.repo.has_issues\": false, '\n", + " '\"pull_request.base.repo.has_projects\": true, '\n", + " '\"pull_request.base.repo.has_downloads\": true, '\n", + " '\"pull_request.base.repo.has_wiki\": true, '\n", + " '\"pull_request.base.repo.has_pages\": false, '\n", + " '\"pull_request.base.repo.forks_count\": 0, '\n", + " '\"pull_request.base.repo.archived\": false, '\n", + " '\"pull_request.base.repo.disabled\": false, '\n", + " '\"pull_request.base.repo.open_issues_count\": 0, '\n", + " '\"pull_request.base.repo.forks\": 0, '\n", + " '\"pull_request.base.repo.open_issues\": 0, '\n", + " '\"pull_request.base.repo.watchers\": 0, '\n", + " '\"pull_request.base.repo.default_branch\": \"master\", '\n", + " '\"pull_request.base.repo.license.key\": \"cc-by-4.0\", '\n", + " '\"pull_request.base.repo.license.spdx_id\": '\n", + " '\"CC-BY-4.0\", \"pull_request.base.repo.license.name\": '\n", + " '\"Creative Commons Attribution 4.0 International\", '\n", + " '\"pull_request.guid\": \"kofj/website/pull/2460\"}]',\n", + " 'pull_request.guid': 'kofj/website/pull/2460',\n", + " 'pull_request.issue_events': None}\n" + ] + } + ], + "source": [ + "from pprint import pprint\n", + "\n", + "pprint(ds[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 412, + "metadata": {}, + "outputs": [], + "source": [ + "# merge all three instances\n", + "\n", + "pull_request_info_cols = [\n", + " \"repo.name\",\n", + " \"repo.id\",\n", + " \"org.id\",\n", + " \"public\",\n", + " 
\"pull_request.id\",\n", + " \"pull_request.guid\",\n", + " \"pull_request.number\",\n", + " \"pull_request.title\",\n", + " \"pull_request.body\",\n", + " \"pull_request.state\",\n", + " \"pull_request.user.login\",\n", + " \"pull_request.user.id\",\n", + " # add user type\n", + " \"pull_request.head.user.type\",\n", + " \"pull_request.base.user.type\",\n", + " \"pull_request.created_at\",\n", + " \"pull_request.closed_at\",\n", + " \"pull_request.merged_at\",\n", + " \"pull_request.merged_by.login\",\n", + " \"pull_request.milestone.title\",\n", + " \"pull_request.milestone.description\",\n", + " \"pull_request.milestone.number\",\n", + " # commits\n", + " 'pull_request.commits',\n", + " 'pull_request.additions',\n", + " 'pull_request.deletions',\n", + " # changed files\n", + " 'pull_request.changed_files',\n", + " \"pull_request.comments\",\n", + " \"pull_request.review_comments\",\n", + "]\n", + "\n", + "head_info_cols = [\n", + " \"pull_request.head.label\",\n", + " \"pull_request.head.ref\",\n", + " \"pull_request.head.user.login\",\n", + " \"pull_request.head.user.type\",\n", + " \"pull_request.head.repo.owner.login\",\n", + " \"pull_request.head.repo.owner.type\",\n", + " \"pull_request.head.repo.license.name\",\n", + " \"pull_request.head.sha\",\n", + " 'pull_request.head.repo.name',\n", + " 'pull_request.head.repo.owner.login',\n", + " 'pull_request.head.repo.homepage',\n", + " 'pull_request.head.repo.description',\n", + " 'pull_request.head.repo.language',\n", + " 'pull_request.head.repo.stargazers_count',\n", + " 'pull_request.head.repo.license.name',\n", + " 'pull_request.head.repo.default_branch',\n", + " 'pull_request.head.repo.private'\n", + "]\n", + "base_info_cols = [\n", + " \"pull_request.base.label\",\n", + " \"pull_request.base.ref\",\n", + " \"pull_request.base.sha\",\n", + " \"pull_request.base.user.login\",\n", + " \"pull_request.base.user.type\",\n", + " \"pull_request.base.repo.owner.login\",\n", + " 
\"pull_request.base.repo.owner.type\",\n", + " \"pull_request.base.repo.license.name\",\n", + " \"pull_request.base.repo.default_branch\",\n", + " \"pull_request.base.repo.description\",\n", + " \"pull_request.base.repo.language\",\n", + " \"pull_request.base.repo.watchers_count\",\n", + " \"pull_request.base.repo.open_issues_count\",\n", + " \"pull_request.base.repo.forks_count\",\n", + " 'pull_request.base.repo.name',\n", + " 'pull_request.base.repo.owner.login',\n", + " 'pull_request.base.repo.homepage',\n", + " 'pull_request.base.repo.description',\n", + " 'pull_request.base.repo.language',\n", + " 'pull_request.base.repo.stargazers_count',\n", + " 'pull_request.base.repo.private',\n", + " 'pull_request.comments',\n", + " 'pull_request.review_comments',\n", + " 'pull_request.label.name',\n", + "]\n", + "\n", + "reviews_info = [# review events only\n", + " 'actor.login',\n", + " 'actor.id',\n", + " 'user.login',\n", + " 'user.type',\n", + " 'review.state',\n", + " 'review.id', \n", + " 'review.body', \n", + " 'review.commit_id', \n", + " 'review.submitted_at', \n", + " 'review.author_association',\n", + " \"pull_request.state\",\n", + " \"pull_request.merged\",\n", + " \"pull_request.merged_by.login\",\n", + " \"pull_request.merged_by.type\",\n", + " # comments\n", + " 'comment.id',\n", + " 'comment.diff_hunk',\n", + " 'comment.body',\n", + " 'comment.path',\n", + " 'comment.position',\n", + " 'comment.original_position',\n", + " 'comment.commit_id',\n", + " 'comment.original_commit_id',\n", + " 'comment.created_at',\n", + " 'comment.updated_at',\n", + " 'comment.author_association',\n", + " 'comment.start_line',\n", + " 'comment.original_start_line',\n", + " 'comment.start_side',\n", + " 'comment.line',\n", + " 'comment.original_line',\n", + " 'comment.side',\n", + " 'comment.in_reply_to_id',]\n", + "\n", + "\n", + "issues_info = [\n", + " 'author',\n", + " 'comment',\n", + " 'comment_id']\n", + " \n", + "event_info = reviews_info + issues_info\n", + "\n", + 
"def get_event_info(review):\n", + " res = {k: review[k] if k in review else None for k in event_info}\n", + " # for keys in issues_info add prefix issue.\n", + " for k in issues_info:\n", + " res[\"issue.\" + k] = res[k]\n", + " del res[k]\n", + " return res\n", + "\n", + "def load_json(data):\n", + " try:\n", + " data = json.loads(data)\n", + " if isinstance(data, dict):\n", + " data = [data]\n", + " return data\n", + " except TypeError:\n", + " return []\n", + "\n", + "def update_datetime(e):\n", + " e[\"created_at\"] = parse(e[\"created_at\"])\n", + " return e\n", + "\n", + "def merge_events(row):\n", + " events = load_json(row[\"pull_request.events\"])\n", + " reviews = load_json(row[\"pull_request.code_review_events\"])\n", + " issues = load_json(row[\"pull_request.issue_events\"])\n", + "\n", + " assert len(issues) <= 1\n", + " if issues:\n", + " issues_events = issues[0][\"events\"]\n", + " # for each events in each category group all events sorted by \"created_at\" in one list\n", + " for e in issues_events:\n", + " e[\"created_at\"] = parse(e[\"datetime\"])\n", + " del e[\"datetime\"]\n", + " else:\n", + " issues_events = []\n", + " events = [update_datetime(e) for e in events]\n", + " reviews = [update_datetime(e) for e in reviews]\n", + " all_events = sorted(\n", + " events + reviews + issues_events,\n", + " key=lambda x: x[\"created_at\"]\n", + " )\n", + " try:\n", + " base_data = events[0] if events else reviews[0]\n", + " except IndexError:\n", + " if issues:\n", + " base_data = issues_events[0]\n", + " first_event = issues[0][\"events\"][0]\n", + " base_data['pull_request.title'] = first_event[\"title\"]\n", + " print(f'base data keys: {base_data.keys()}')\n", + " base_data[\"repo.name\"] = base_data[\"repo\"]\n", + " base_data[\"org.id\"] = base_data[\"org\"]\n", + " base_data[\"repo.name\"] = base_data[\"repo\"]\n", + " base_data[\"pull_request.number\"] = int(base_data[\"pull_request\"][\"number\"])\n", + " base_data[\"pull_request.user.login\"] 
= base_data[\"pull_request\"][\"user_login\"]\n", + " print(\"filling PR data from issue event\")\n", + " else:\n", + " raise IndexError(\"No events for PR\")\n", + " \n", + " # Initialize with default values\n", + " pr_info = {k: None for k in pull_request_info_cols}\n", + " head_info = {k: None for k in head_info_cols}\n", + " base_info = {k: None for k in base_info_cols}\n", + "\n", + " # Fill available data\n", + " pr_info.update({k: base_data[k] if k in base_data else None for k in pull_request_info_cols})\n", + " head_info.update({k: base_data[k] if k in base_data else None for k in head_info_cols })\n", + " base_info.update({k: base_data[k] if k in base_data else None for k in base_info_cols})\n", + "\n", + " # each comment should have \"comments\" and \"review_comments\" fields with \"extra_review_info\" field\n", + " comments = [{\"type\": e[\"type\"],\n", + " \"action\": e[\"action\"],\n", + " \"created_at\": e[\"created_at\"],\n", + " **get_event_info(e)} for e in all_events]\n", + " new_row = {\"pull_request_info\": pr_info, \"head_repo_info\": head_info, \"base_repo_info\": base_info, \"events\": comments}\n", + " return new_row" + ] + }, + { + "cell_type": "code", + "execution_count": 413, + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'repo'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[412], line 167\u001b[0m, in \u001b[0;36mmerge_events\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 166\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 167\u001b[0m base_data \u001b[39m=\u001b[39m events[\u001b[39m0\u001b[39m] \u001b[39mif\u001b[39;00m events \u001b[39melse\u001b[39;00m reviews[\u001b[39m0\u001b[39;49m]\n\u001b[1;32m 168\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mIndexError\u001b[39;00m:\n", + "\u001b[0;31mIndexError\u001b[0m: 
list index out of range", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[413], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m new_row \u001b[39m=\u001b[39m merge_events(row)\n", + "Cell \u001b[0;32mIn[412], line 173\u001b[0m, in \u001b[0;36mmerge_events\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 171\u001b[0m first_event \u001b[39m=\u001b[39m issues[\u001b[39m0\u001b[39m][\u001b[39m\"\u001b[39m\u001b[39mevents\u001b[39m\u001b[39m\"\u001b[39m][\u001b[39m0\u001b[39m]\n\u001b[1;32m 172\u001b[0m base_data[\u001b[39m'\u001b[39m\u001b[39mpull_request.title\u001b[39m\u001b[39m'\u001b[39m] \u001b[39m=\u001b[39m first_event[\u001b[39m\"\u001b[39m\u001b[39mtitle\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[0;32m--> 173\u001b[0m base_data[\u001b[39m\"\u001b[39m\u001b[39mrepo.name\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m base_data[\u001b[39m\"\u001b[39;49m\u001b[39mrepo\u001b[39;49m\u001b[39m\"\u001b[39;49m]\n\u001b[1;32m 174\u001b[0m base_data[\u001b[39m\"\u001b[39m\u001b[39morg.id\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m base_data[\u001b[39m\"\u001b[39m\u001b[39morg\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[1;32m 175\u001b[0m base_data[\u001b[39m\"\u001b[39m\u001b[39mrepo.name\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m base_data[\u001b[39m\"\u001b[39m\u001b[39mrepo\u001b[39m\u001b[39m\"\u001b[39m]\n", + "\u001b[0;31mKeyError\u001b[0m: 'repo'" + ] + } + ], + "source": [ + "new_row = merge_events(row)" + ] + }, + { + "cell_type": "code", + "execution_count": 411, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "ename": "KeyError", + "evalue": "'repo'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call 
last)", + "Cell \u001b[0;32mIn[410], line 167\u001b[0m, in \u001b[0;36mmerge_events\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 166\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 167\u001b[0m base_data \u001b[39m=\u001b[39m events[\u001b[39m0\u001b[39m] \u001b[39mif\u001b[39;00m events \u001b[39melse\u001b[39;00m reviews[\u001b[39m0\u001b[39;49m]\n\u001b[1;32m 168\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mIndexError\u001b[39;00m:\n", + "\u001b[0;31mIndexError\u001b[0m: list index out of range", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[411], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m small_ds_2 \u001b[39m=\u001b[39m ds\u001b[39m.\u001b[39mselect(\u001b[39mrange\u001b[39m(\u001b[39m1000\u001b[39m))\n\u001b[0;32m----> 2\u001b[0m dd \u001b[39m=\u001b[39m small_ds_2\u001b[39m.\u001b[39;49mmap(merge_events)\n", + "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:580\u001b[0m, in \u001b[0;36mtransmit_tasks..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 578\u001b[0m \u001b[39mself\u001b[39m: \u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m \u001b[39m=\u001b[39m kwargs\u001b[39m.\u001b[39mpop(\u001b[39m\"\u001b[39m\u001b[39mself\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 579\u001b[0m \u001b[39m# apply actual function\u001b[39;00m\n\u001b[0;32m--> 580\u001b[0m out: Union[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mDatasetDict\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m func(\u001b[39mself\u001b[39;49m, \u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 581\u001b[0m datasets: List[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(out\u001b[39m.\u001b[39mvalues()) 
\u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(out, \u001b[39mdict\u001b[39m) \u001b[39melse\u001b[39;00m [out]\n\u001b[1;32m 582\u001b[0m \u001b[39mfor\u001b[39;00m dataset \u001b[39min\u001b[39;00m datasets:\n\u001b[1;32m 583\u001b[0m \u001b[39m# Remove task templates if a column mapping of the template is no longer valid\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:545\u001b[0m, in \u001b[0;36mtransmit_format..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 538\u001b[0m self_format \u001b[39m=\u001b[39m {\n\u001b[1;32m 539\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mtype\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_format_type,\n\u001b[1;32m 540\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mformat_kwargs\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_format_kwargs,\n\u001b[1;32m 541\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mcolumns\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_format_columns,\n\u001b[1;32m 542\u001b[0m \u001b[39m\"\u001b[39m\u001b[39moutput_all_columns\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_output_all_columns,\n\u001b[1;32m 543\u001b[0m }\n\u001b[1;32m 544\u001b[0m \u001b[39m# apply actual function\u001b[39;00m\n\u001b[0;32m--> 545\u001b[0m out: Union[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mDatasetDict\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m func(\u001b[39mself\u001b[39;49m, \u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 546\u001b[0m datasets: List[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(out\u001b[39m.\u001b[39mvalues()) \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(out, \u001b[39mdict\u001b[39m) 
\u001b[39melse\u001b[39;00m [out]\n\u001b[1;32m 547\u001b[0m \u001b[39m# re-apply format to the output\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:3087\u001b[0m, in \u001b[0;36mDataset.map\u001b[0;34m(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)\u001b[0m\n\u001b[1;32m 3079\u001b[0m \u001b[39mif\u001b[39;00m transformed_dataset \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 3080\u001b[0m \u001b[39mwith\u001b[39;00m logging\u001b[39m.\u001b[39mtqdm(\n\u001b[1;32m 3081\u001b[0m disable\u001b[39m=\u001b[39m\u001b[39mnot\u001b[39;00m logging\u001b[39m.\u001b[39mis_progress_bar_enabled(),\n\u001b[1;32m 3082\u001b[0m unit\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m examples\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 3085\u001b[0m desc\u001b[39m=\u001b[39mdesc \u001b[39mor\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mMap\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 3086\u001b[0m ) \u001b[39mas\u001b[39;00m pbar:\n\u001b[0;32m-> 3087\u001b[0m \u001b[39mfor\u001b[39;00m rank, done, content \u001b[39min\u001b[39;00m Dataset\u001b[39m.\u001b[39m_map_single(\u001b[39m*\u001b[39m\u001b[39m*\u001b[39mdataset_kwargs):\n\u001b[1;32m 3088\u001b[0m \u001b[39mif\u001b[39;00m done:\n\u001b[1;32m 3089\u001b[0m shards_done \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m1\u001b[39m\n", + "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:3441\u001b[0m, in \u001b[0;36mDataset._map_single\u001b[0;34m(shard, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, cache_file_name, writer_batch_size, features, 
disable_nullable, fn_kwargs, new_fingerprint, rank, offset)\u001b[0m\n\u001b[1;32m 3439\u001b[0m _time \u001b[39m=\u001b[39m time\u001b[39m.\u001b[39mtime()\n\u001b[1;32m 3440\u001b[0m \u001b[39mfor\u001b[39;00m i, example \u001b[39min\u001b[39;00m shard_iterable:\n\u001b[0;32m-> 3441\u001b[0m example \u001b[39m=\u001b[39m apply_function_on_filtered_inputs(example, i, offset\u001b[39m=\u001b[39;49moffset)\n\u001b[1;32m 3442\u001b[0m \u001b[39mif\u001b[39;00m update_data:\n\u001b[1;32m 3443\u001b[0m \u001b[39mif\u001b[39;00m i \u001b[39m==\u001b[39m \u001b[39m0\u001b[39m:\n", + "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:3344\u001b[0m, in \u001b[0;36mDataset._map_single..apply_function_on_filtered_inputs\u001b[0;34m(pa_inputs, indices, check_same_num_examples, offset)\u001b[0m\n\u001b[1;32m 3342\u001b[0m \u001b[39mif\u001b[39;00m with_rank:\n\u001b[1;32m 3343\u001b[0m additional_args \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m (rank,)\n\u001b[0;32m-> 3344\u001b[0m processed_inputs \u001b[39m=\u001b[39m function(\u001b[39m*\u001b[39;49mfn_args, \u001b[39m*\u001b[39;49madditional_args, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mfn_kwargs)\n\u001b[1;32m 3345\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(processed_inputs, LazyDict):\n\u001b[1;32m 3346\u001b[0m processed_inputs \u001b[39m=\u001b[39m {\n\u001b[1;32m 3347\u001b[0m k: v \u001b[39mfor\u001b[39;00m k, v \u001b[39min\u001b[39;00m processed_inputs\u001b[39m.\u001b[39mdata\u001b[39m.\u001b[39mitems() \u001b[39mif\u001b[39;00m k \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m processed_inputs\u001b[39m.\u001b[39mkeys_to_format\n\u001b[1;32m 3348\u001b[0m }\n", + "Cell \u001b[0;32mIn[410], line 173\u001b[0m, in \u001b[0;36mmerge_events\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 171\u001b[0m first_event \u001b[39m=\u001b[39m 
issues[\u001b[39m0\u001b[39m][\u001b[39m\"\u001b[39m\u001b[39mevents\u001b[39m\u001b[39m\"\u001b[39m][\u001b[39m0\u001b[39m]\n\u001b[1;32m 172\u001b[0m base_data[\u001b[39m'\u001b[39m\u001b[39mpull_request.title\u001b[39m\u001b[39m'\u001b[39m] \u001b[39m=\u001b[39m first_event[\u001b[39m\"\u001b[39m\u001b[39mtitle\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[0;32m--> 173\u001b[0m base_data[\u001b[39m\"\u001b[39m\u001b[39mrepo.name\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m base_data[\u001b[39m\"\u001b[39;49m\u001b[39mrepo\u001b[39;49m\u001b[39m\"\u001b[39;49m]\n\u001b[1;32m 174\u001b[0m base_data[\u001b[39m\"\u001b[39m\u001b[39morg.id\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m base_data[\u001b[39m\"\u001b[39m\u001b[39morg\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[1;32m 175\u001b[0m base_data[\u001b[39m\"\u001b[39m\u001b[39mrepo.name\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m base_data[\u001b[39m\"\u001b[39m\u001b[39mrepo\u001b[39m\u001b[39m\"\u001b[39m]\n", + "\u001b[0;31mKeyError\u001b[0m: 'repo'" + ] + } + ], + "source": [ + "small_ds_2 = ds.select(range(1000))\n", + "dd = small_ds_2.map(merge_events)" + ] + }, + { + "cell_type": "code", + "execution_count": 405, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['action', 'author', 'comment', 'comment_id', 'description', 'title', 'type', 'created_at'])" + ] + }, + "execution_count": 405, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "issues_events[0].keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 366, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'pull_request.guid': 'karen-kua/COVID-19_Tracker/pull/15',\n", + " 'pull_request.code_review_events': None,\n", + " 'pull_request.events': None,\n", + " 'pull_request.issue_events': '{\"repo\": \"karen-kua/COVID-19_Tracker\", \"org\": null, \"issue_id\": 1018615993, \"issue_number\": 15, \"pull_request\": {\"number\": 15.0, \"repo\": 
\"COVID-19_Tracker\", \"user_login\": \"karen-kua\"}, \"events\": [{\"action\": \"opened\", \"author\": \"dependabot[bot]\", \"comment\": null, \"comment_id\": null, \"datetime\": \"2021-10-06T15:46:43Z\", \"description\": \"Bumps [url-parse](https://github.com/unshiftio/url-parse) from 1.4.7 to 1.5.3.\\\\n
\\\\nCommits\\\\n
    \\\\n
  • ad44493 [dist] 1.5.3
  • \\\\n
  • c798461 [fix] Fix host parsing for file URLs (#210)
  • \\\\n
  • 201034b [dist] 1.5.2
  • \\\\n
  • 2d9ac2c [fix] Sanitize only special URLs (#209)
  • \\\\n
  • fb128af [fix] Use \\'null\\' as origin for non special URLs
  • \\\\n
  • fed6d9e [fix] Add a leading slash only if the URL is special
  • \\\\n
  • 94872e7 [fix] Do not incorrectly set the slashes property to true
  • \\\\n
  • 81ab967 [fix] Ignore slashes after the protocol for special URLs
  • \\\\n
  • ee22050 [ci] Use GitHub Actions
  • \\\\n
  • d2979b5 [fix] Special case the file: protocol (#204)
  • \\\\n
  • Additional commits viewable in compare view
  • \\\\n
\\\\n
\\\\n
\\\\n\\\\n\\\\n[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=url-parse&package-manager=npm_and_yarn&previous-version=1.4.7&new-version=1.5.3)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)\\\\n\\\\nDependabot will resolve any conflicts with this PR as long as you don\\'t alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.\\\\n\\\\n[//]: # (dependabot-automerge-start)\\\\n[//]: # (dependabot-automerge-end)\\\\n\\\\n---\\\\n\\\\n
\\\\nDependabot commands and options\\\\n
\\\\n\\\\nYou can trigger Dependabot actions by commenting on this PR:\\\\n- `@dependabot rebase` will rebase this PR\\\\n- `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it\\\\n- `@dependabot merge` will merge this PR after your CI passes on it\\\\n- `@dependabot squash and merge` will squash and merge this PR after your CI passes on it\\\\n- `@dependabot cancel merge` will cancel a previously requested merge and block automerging\\\\n- `@dependabot reopen` will reopen this PR if it is closed\\\\n- `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually\\\\n- `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)\\\\n- `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)\\\\n- `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)\\\\n- `@dependabot use these labels` will set the current labels as the default for future PRs for this repo and language\\\\n- `@dependabot use these reviewers` will set the current reviewers as the default for future PRs for this repo and language\\\\n- `@dependabot use these assignees` will set the current assignees as the default for future PRs for this repo and language\\\\n- `@dependabot use this milestone` will set the current milestone as the default for future PRs for this repo and language\\\\n\\\\nYou can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/azukimochi/COVID-19_Tracker/network/alerts).\\\\n\\\\n
\", \"title\": \"Bump url-parse from 1.4.7 to 1.5.3\", \"type\": \"issue\"}, {\"action\": \"created\", \"author\": \"dependabot[bot]\", \"comment\": \"Superseded by #17.\", \"comment_id\": 1045459471.0, \"datetime\": \"2022-02-19 00:53:17+00:00\", \"description\": null, \"title\": null, \"type\": \"comment\"}]}',\n", + " 'bucket': '940',\n", + " '__index_level_0__': 72946}" + ] + }, + "execution_count": 366, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "row" + ] + }, + { + "cell_type": "code", + "execution_count": 360, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['repo', 'org', 'issue_id', 'issue_number', 'pull_request', 'events'])" + ] + }, + "execution_count": 360, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "issues[0].keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 361, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'number': 15.0, 'repo': 'COVID-19_Tracker', 'user_login': 'karen-kua'}" + ] + }, + "execution_count": 361, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "issues[0][\"pull_request\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 351, + "metadata": {}, + "outputs": [], + "source": [ + "small_ds_2 = ds.select(range(500))" + ] + }, + { + "cell_type": "code", + "execution_count": 398, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "ename": "KeyError", + "evalue": "'events'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[396], line 167\u001b[0m, in \u001b[0;36mmerge_events\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 166\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 167\u001b[0m base_data \u001b[39m=\u001b[39m 
events[\u001b[39m0\u001b[39m] \u001b[39mif\u001b[39;00m events \u001b[39melse\u001b[39;00m reviews[\u001b[39m0\u001b[39;49m]\n\u001b[1;32m 168\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mIndexError\u001b[39;00m:\n", + "\u001b[0;31mIndexError\u001b[0m: list index out of range", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[398], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m merged_ds \u001b[39m=\u001b[39m small_ds_2\u001b[39m.\u001b[39;49mmap(merge_events, remove_columns\u001b[39m=\u001b[39;49m[\u001b[39m\"\u001b[39;49m\u001b[39mpull_request.events\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m\"\u001b[39;49m\u001b[39mpull_request.code_review_events\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m\"\u001b[39;49m\u001b[39mpull_request.issue_events\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m'\u001b[39;49m\u001b[39m__index_level_0__\u001b[39;49m\u001b[39m'\u001b[39;49m,\u001b[39m'\u001b[39;49m\u001b[39mpull_request.guid\u001b[39;49m\u001b[39m'\u001b[39;49m])\n", + "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:580\u001b[0m, in \u001b[0;36mtransmit_tasks..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 578\u001b[0m \u001b[39mself\u001b[39m: \u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m \u001b[39m=\u001b[39m kwargs\u001b[39m.\u001b[39mpop(\u001b[39m\"\u001b[39m\u001b[39mself\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 579\u001b[0m \u001b[39m# apply actual function\u001b[39;00m\n\u001b[0;32m--> 580\u001b[0m out: Union[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mDatasetDict\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m func(\u001b[39mself\u001b[39;49m, \u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 581\u001b[0m datasets: 
List[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(out\u001b[39m.\u001b[39mvalues()) \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(out, \u001b[39mdict\u001b[39m) \u001b[39melse\u001b[39;00m [out]\n\u001b[1;32m 582\u001b[0m \u001b[39mfor\u001b[39;00m dataset \u001b[39min\u001b[39;00m datasets:\n\u001b[1;32m 583\u001b[0m \u001b[39m# Remove task templates if a column mapping of the template is no longer valid\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:545\u001b[0m, in \u001b[0;36mtransmit_format..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 538\u001b[0m self_format \u001b[39m=\u001b[39m {\n\u001b[1;32m 539\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mtype\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_format_type,\n\u001b[1;32m 540\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mformat_kwargs\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_format_kwargs,\n\u001b[1;32m 541\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mcolumns\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_format_columns,\n\u001b[1;32m 542\u001b[0m \u001b[39m\"\u001b[39m\u001b[39moutput_all_columns\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_output_all_columns,\n\u001b[1;32m 543\u001b[0m }\n\u001b[1;32m 544\u001b[0m \u001b[39m# apply actual function\u001b[39;00m\n\u001b[0;32m--> 545\u001b[0m out: Union[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mDatasetDict\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m func(\u001b[39mself\u001b[39;49m, \u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 546\u001b[0m datasets: List[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m] 
\u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(out\u001b[39m.\u001b[39mvalues()) \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(out, \u001b[39mdict\u001b[39m) \u001b[39melse\u001b[39;00m [out]\n\u001b[1;32m 547\u001b[0m \u001b[39m# re-apply format to the output\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:3087\u001b[0m, in \u001b[0;36mDataset.map\u001b[0;34m(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)\u001b[0m\n\u001b[1;32m 3079\u001b[0m \u001b[39mif\u001b[39;00m transformed_dataset \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 3080\u001b[0m \u001b[39mwith\u001b[39;00m logging\u001b[39m.\u001b[39mtqdm(\n\u001b[1;32m 3081\u001b[0m disable\u001b[39m=\u001b[39m\u001b[39mnot\u001b[39;00m logging\u001b[39m.\u001b[39mis_progress_bar_enabled(),\n\u001b[1;32m 3082\u001b[0m unit\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m examples\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 3085\u001b[0m desc\u001b[39m=\u001b[39mdesc \u001b[39mor\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mMap\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 3086\u001b[0m ) \u001b[39mas\u001b[39;00m pbar:\n\u001b[0;32m-> 3087\u001b[0m \u001b[39mfor\u001b[39;00m rank, done, content \u001b[39min\u001b[39;00m Dataset\u001b[39m.\u001b[39m_map_single(\u001b[39m*\u001b[39m\u001b[39m*\u001b[39mdataset_kwargs):\n\u001b[1;32m 3088\u001b[0m \u001b[39mif\u001b[39;00m done:\n\u001b[1;32m 3089\u001b[0m shards_done \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m1\u001b[39m\n", + "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:3441\u001b[0m, in \u001b[0;36mDataset._map_single\u001b[0;34m(shard, 
function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, new_fingerprint, rank, offset)\u001b[0m\n\u001b[1;32m 3439\u001b[0m _time \u001b[39m=\u001b[39m time\u001b[39m.\u001b[39mtime()\n\u001b[1;32m 3440\u001b[0m \u001b[39mfor\u001b[39;00m i, example \u001b[39min\u001b[39;00m shard_iterable:\n\u001b[0;32m-> 3441\u001b[0m example \u001b[39m=\u001b[39m apply_function_on_filtered_inputs(example, i, offset\u001b[39m=\u001b[39;49moffset)\n\u001b[1;32m 3442\u001b[0m \u001b[39mif\u001b[39;00m update_data:\n\u001b[1;32m 3443\u001b[0m \u001b[39mif\u001b[39;00m i \u001b[39m==\u001b[39m \u001b[39m0\u001b[39m:\n", + "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:3344\u001b[0m, in \u001b[0;36mDataset._map_single..apply_function_on_filtered_inputs\u001b[0;34m(pa_inputs, indices, check_same_num_examples, offset)\u001b[0m\n\u001b[1;32m 3342\u001b[0m \u001b[39mif\u001b[39;00m with_rank:\n\u001b[1;32m 3343\u001b[0m additional_args \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m (rank,)\n\u001b[0;32m-> 3344\u001b[0m processed_inputs \u001b[39m=\u001b[39m function(\u001b[39m*\u001b[39;49mfn_args, \u001b[39m*\u001b[39;49madditional_args, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mfn_kwargs)\n\u001b[1;32m 3345\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(processed_inputs, LazyDict):\n\u001b[1;32m 3346\u001b[0m processed_inputs \u001b[39m=\u001b[39m {\n\u001b[1;32m 3347\u001b[0m k: v \u001b[39mfor\u001b[39;00m k, v \u001b[39min\u001b[39;00m processed_inputs\u001b[39m.\u001b[39mdata\u001b[39m.\u001b[39mitems() \u001b[39mif\u001b[39;00m k \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m processed_inputs\u001b[39m.\u001b[39mkeys_to_format\n\u001b[1;32m 3348\u001b[0m }\n", + "Cell \u001b[0;32mIn[396], line 170\u001b[0m, in 
\u001b[0;36mmerge_events\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 168\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mIndexError\u001b[39;00m:\n\u001b[1;32m 169\u001b[0m base_data \u001b[39m=\u001b[39m issues_events[\u001b[39m0\u001b[39m]\n\u001b[0;32m--> 170\u001b[0m first_event \u001b[39m=\u001b[39m base_data[\u001b[39m\"\u001b[39;49m\u001b[39mevents\u001b[39;49m\u001b[39m\"\u001b[39;49m][\u001b[39m0\u001b[39m]\n\u001b[1;32m 171\u001b[0m base_data[\u001b[39m'\u001b[39m\u001b[39mpull_request.title\u001b[39m\u001b[39m'\u001b[39m] \u001b[39m=\u001b[39m first_event[\u001b[39m\"\u001b[39m\u001b[39mtitle\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[1;32m 172\u001b[0m base_data[\u001b[39m\"\u001b[39m\u001b[39mrepo.name\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m base_data[\u001b[39m\"\u001b[39m\u001b[39mrepo\u001b[39m\u001b[39m\"\u001b[39m]\n", + "\u001b[0;31mKeyError\u001b[0m: 'events'" + ] + } + ], + "source": [ + "merged_ds = small_ds_2.map(merge_events, remove_columns=[\"pull_request.events\", \"pull_request.code_review_events\", \"pull_request.issue_events\", '__index_level_0__','pull_request.guid'])" + ] + }, + { + "cell_type": "code", + "execution_count": 150, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Creating parquet from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 31.42ba/s]\n", + "Upload 1 LFS files: 100%|██████████| 1/1 [00:10<00:00, 10.30s/it]\n", + "Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:11<00:00, 11.45s/it]\n" + ] + } + ], + "source": [ + "merged_ds.push_to_hub(\"loubnabnl/code_reviews_3\")" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading readme: 100%|██████████| 5.88k/5.88k [00:00<00:00, 3.76MB/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading and preparing dataset None/None to 
/Users/loubnabenallal/.cache/huggingface/datasets/loubnabnl___parquet/loubnabnl--clean_prs2-50c7cc07186d2bb2/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading data: 100%|██████████| 16.1M/16.1M [00:00<00:00, 17.4MB/s]\n", + "Downloading data files: 100%|██████████| 1/1 [00:02<00:00, 2.65s/it]\n", + "Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 676.50it/s]\n", + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset parquet downloaded and prepared to /Users/loubnabenallal/.cache/huggingface/datasets/loubnabnl___parquet/loubnabnl--clean_prs2-50c7cc07186d2bb2/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.\n" + ] + }, + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['bucket', 'pull_request_info', 'head_repo_info', 'base_repo_info', 'events'],\n", + " num_rows: 10000\n", + "})" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds = load_dataset(\"loubnabnl/clean_prs2\", split=\"train\")\n", + "ds" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'__index_level_0__': 1028,\n", + " 'bucket': None,\n", + " 'pull_request.code_review_events': None,\n", + " 'pull_request.events': '[{\"type\": \"PullRequestEvent\", \"action\": \"opened\", '\n", + " '\"actor.login\": \"M-Davies\", \"actor.id\": 25231953, '\n", + " '\"user.login\": null, \"user.id\": null, \"user.type\": '\n", + " 'null, \"repo.name\": \"AdoptOpenJDK/openjdk-build\", '\n", + " '\"repo.id\": 85294562, \"public\": true, \"created_at\": '\n", + " '\"2020-05-28T09:45:30Z\", \"org.id\": 1673867, '\n", + " '\"org.login\": \"AdoptOpenJDK\", \"pull_request.id\": '\n", + " '424372800, \"pull_request.number\": 
1787, '\n", + " '\"pull_request.state\": \"open\", \"pull_request.title\": '\n", + " '\"Revert \\'Fire installer failure on all failed '\n", + " 'results\\'\", \"pull_request.body\": \"* Seems to cause a '\n", + " 'lot of false positives or just doesnt work overall. '\n", + " 'Better to just remove for '\n", + " 'now\\\\r\\\\n\\\\r\\\\nSigned-off-by: Morgan Davies '\n", + " '\", \"pull_request.user.login\": '\n", + " '\"M-Davies\", \"pull_request.user.id\": 25231953, '\n", + " '\"pull_request.author_association\": \"CONTRIBUTOR\", '\n", + " '\"pull_request.created_at\": \"2020-05-28T09:45:30Z\", '\n", + " '\"pull_request.updated_at\": \"2020-05-28T09:45:30Z\", '\n", + " '\"pull_request.closed_at\": null, '\n", + " '\"pull_request.merged_at\": null, '\n", + " '\"pull_request.merge_commit_sha\": null, '\n", + " '\"pull_request.locked\": false, '\n", + " '\"pull_request.assignee.login\": null, '\n", + " '\"pull_request.assignee.id\": null, '\n", + " '\"pull_request.assignee.type\": null, '\n", + " '\"pull_request.assignee.site_admin\": null, '\n", + " '\"pull_request.milestone.id\": null, '\n", + " '\"pull_request.milestone.number\": null, '\n", + " '\"pull_request.milestone.title\": null, '\n", + " '\"pull_request.milestone.description\": null, '\n", + " '\"pull_request.milestone.creator.login\": null, '\n", + " '\"pull_request.milestone.creator.id\": null, '\n", + " '\"pull_request.milestone.creator.type\": null, '\n", + " '\"pull_request.milestone.creator.site_admin\": null, '\n", + " '\"pull_request.milestone.open_issues\": null, '\n", + " '\"pull_request.milestone.closed_issues\": null, '\n", + " '\"pull_request.milestone.state\": null, '\n", + " '\"pull_request.milestone.created_at\": null, '\n", + " '\"pull_request.milestone.updated_at\": null, '\n", + " '\"pull_request.milestone.due_on\": null, '\n", + " '\"pull_request.milestone.closed_at\": null, '\n", + " '\"pull_request.merged\": false, '\n", + " '\"pull_request.mergeable\": null, '\n", + " 
'\"pull_request.mergeable_state\": \"unknown\", '\n", + " '\"pull_request.merged_by.login\": null, '\n", + " '\"pull_request.merged_by.id\": null, '\n", + " '\"pull_request.merged_by.type\": null, '\n", + " '\"pull_request.merged_by.site_admin\": null, '\n", + " '\"pull_request.comments\": 0, '\n", + " '\"pull_request.review_comments\": 0, '\n", + " '\"pull_request.commits\": 1, \"pull_request.additions\": '\n", + " '4, \"pull_request.deletions\": 6, '\n", + " '\"pull_request.changed_files\": 1, '\n", + " '\"pull_request.label.id\": null, '\n", + " '\"pull_request.label.name\": null, '\n", + " '\"pull_request.label.color\": null, '\n", + " '\"pull_request.label.default\": null, '\n", + " '\"pull_request.head.label\": \"M-Davies:revert\", '\n", + " '\"pull_request.head.ref\": \"revert\", '\n", + " '\"pull_request.head.sha\": '\n", + " '\"023faba7db4130d746f68e6b4fb26170a3834254\", '\n", + " '\"pull_request.head.user.login\": \"M-Davies\", '\n", + " '\"pull_request.head.user.type\": \"User\", '\n", + " '\"pull_request.head.repo.name\": \"openjdk-build\", '\n", + " '\"pull_request.head.repo.full_name\": '\n", + " '\"M-Davies/openjdk-build\", '\n", + " '\"pull_request.head.repo.owner.login\": \"M-Davies\", '\n", + " '\"pull_request.head.repo.owner.type\": \"User\", '\n", + " '\"pull_request.head.repo.private\": false, '\n", + " '\"pull_request.head.repo.homepage\": \"\", '\n", + " '\"pull_request.head.repo.description\": \"AdoptOpenJDK '\n", + " 'community OpenJDK build scripts - common across all '\n", + " 'releases/versions\", \"pull_request.head.repo.fork\": '\n", + " 'true, \"pull_request.head.repo.created_at\": '\n", + " '\"2019-11-29T09:24:43Z\", '\n", + " '\"pull_request.head.repo.updated_at\": '\n", + " '\"2020-05-27T14:45:16Z\", '\n", + " '\"pull_request.head.repo.pushed_at\": '\n", + " '\"2020-05-27T14:45:13Z\", '\n", + " '\"pull_request.head.repo.size\": 2383, '\n", + " '\"pull_request.head.repo.stargazers_count\": 0, '\n", + " 
'\"pull_request.head.repo.watchers_count\": 0, '\n", + " '\"pull_request.head.repo.language\": \"Shell\", '\n", + " '\"pull_request.head.repo.has_issues\": false, '\n", + " '\"pull_request.head.repo.has_projects\": true, '\n", + " '\"pull_request.head.repo.has_downloads\": true, '\n", + " '\"pull_request.head.repo.has_wiki\": true, '\n", + " '\"pull_request.head.repo.has_pages\": false, '\n", + " '\"pull_request.head.repo.forks_count\": 0, '\n", + " '\"pull_request.head.repo.archived\": false, '\n", + " '\"pull_request.head.repo.disabled\": false, '\n", + " '\"pull_request.head.repo.open_issues_count\": 0, '\n", + " '\"pull_request.head.repo.forks\": 0, '\n", + " '\"pull_request.head.repo.open_issues\": 0, '\n", + " '\"pull_request.head.repo.watchers\": 0, '\n", + " '\"pull_request.head.repo.default_branch\": \"master\", '\n", + " '\"pull_request.head.repo.license.key\": \"apache-2.0\", '\n", + " '\"pull_request.head.repo.license.spdx_id\": '\n", + " '\"Apache-2.0\", \"pull_request.head.repo.license.name\": '\n", + " '\"Apache License 2.0\", \"pull_request.base.label\": '\n", + " '\"AdoptOpenJDK:master\", \"pull_request.base.ref\": '\n", + " '\"master\", \"pull_request.base.sha\": '\n", + " '\"32a19e7a01b4d50cc8c10f8f675a2aeb2ffeaefb\", '\n", + " '\"pull_request.base.user.login\": \"AdoptOpenJDK\", '\n", + " '\"pull_request.base.user.type\": \"Organization\", '\n", + " '\"pull_request.base.repo.name\": \"openjdk-build\", '\n", + " '\"pull_request.base.repo.full_name\": '\n", + " '\"AdoptOpenJDK/openjdk-build\", '\n", + " '\"pull_request.base.repo.owner.login\": \"AdoptOpenJDK\", '\n", + " '\"pull_request.base.repo.owner.type\": \"Organization\", '\n", + " '\"pull_request.base.repo.private\": false, '\n", + " '\"pull_request.base.repo.homepage\": \"\", '\n", + " '\"pull_request.base.repo.description\": \"AdoptOpenJDK '\n", + " 'community OpenJDK build scripts - common across all '\n", + " 'releases/versions\", \"pull_request.base.repo.fork\": '\n", + " 'false, 
\"pull_request.base.repo.created_at\": '\n", + " '\"2017-03-17T09:31:50Z\", '\n", + " '\"pull_request.base.repo.updated_at\": '\n", + " '\"2020-05-28T07:45:12Z\", '\n", + " '\"pull_request.base.repo.pushed_at\": '\n", + " '\"2020-05-27T14:18:11Z\", '\n", + " '\"pull_request.base.repo.size\": 2234, '\n", + " '\"pull_request.base.repo.stargazers_count\": 620, '\n", + " '\"pull_request.base.repo.watchers_count\": 620, '\n", + " '\"pull_request.base.repo.language\": \"Shell\", '\n", + " '\"pull_request.base.repo.has_issues\": true, '\n", + " '\"pull_request.base.repo.has_projects\": true, '\n", + " '\"pull_request.base.repo.has_downloads\": true, '\n", + " '\"pull_request.base.repo.has_wiki\": true, '\n", + " '\"pull_request.base.repo.has_pages\": false, '\n", + " '\"pull_request.base.repo.forks_count\": 137, '\n", + " '\"pull_request.base.repo.archived\": false, '\n", + " '\"pull_request.base.repo.disabled\": false, '\n", + " '\"pull_request.base.repo.open_issues_count\": 166, '\n", + " '\"pull_request.base.repo.forks\": 137, '\n", + " '\"pull_request.base.repo.open_issues\": 166, '\n", + " '\"pull_request.base.repo.watchers\": 620, '\n", + " '\"pull_request.base.repo.default_branch\": \"master\", '\n", + " '\"pull_request.base.repo.license.key\": \"apache-2.0\", '\n", + " '\"pull_request.base.repo.license.spdx_id\": '\n", + " '\"Apache-2.0\", \"pull_request.base.repo.license.name\": '\n", + " '\"Apache License 2.0\", \"pull_request.guid\": '\n", + " '\"AdoptOpenJDK/openjdk-build/pull/1787\"}, {\"type\": '\n", + " '\"PullRequestEvent\", \"action\": \"closed\", '\n", + " '\"actor.login\": \"sxa\", \"actor.id\": 6487691, '\n", + " '\"user.login\": null, \"user.id\": null, \"user.type\": '\n", + " 'null, \"repo.name\": \"AdoptOpenJDK/openjdk-build\", '\n", + " '\"repo.id\": 85294562, \"public\": true, \"created_at\": '\n", + " '\"2020-05-28T09:51:49Z\", \"org.id\": 1673867, '\n", + " '\"org.login\": \"AdoptOpenJDK\", \"pull_request.id\": '\n", + " '424372800, 
\"pull_request.number\": 1787, '\n", + " '\"pull_request.state\": \"closed\", \"pull_request.title\": '\n", + " '\"Revert \\'Fire installer failure on all failed '\n", + " 'results\\'\", \"pull_request.body\": \"* Seems to cause a '\n", + " 'lot of false positives or just doesnt work overall. '\n", + " 'Better to just remove for '\n", + " 'now\\\\r\\\\n\\\\r\\\\nSigned-off-by: Morgan Davies '\n", + " '\", \"pull_request.user.login\": '\n", + " '\"M-Davies\", \"pull_request.user.id\": 25231953, '\n", + " '\"pull_request.author_association\": \"CONTRIBUTOR\", '\n", + " '\"pull_request.created_at\": \"2020-05-28T09:45:30Z\", '\n", + " '\"pull_request.updated_at\": \"2020-05-28T09:51:48Z\", '\n", + " '\"pull_request.closed_at\": \"2020-05-28T09:51:48Z\", '\n", + " '\"pull_request.merged_at\": \"2020-05-28T09:51:48Z\", '\n", + " '\"pull_request.merge_commit_sha\": '\n", + " '\"4c3495c6f008459ca1c276477c5f968e9dcd7c6b\", '\n", + " '\"pull_request.locked\": false, '\n", + " '\"pull_request.assignee.login\": null, '\n", + " '\"pull_request.assignee.id\": null, '\n", + " '\"pull_request.assignee.type\": null, '\n", + " '\"pull_request.assignee.site_admin\": null, '\n", + " '\"pull_request.milestone.id\": null, '\n", + " '\"pull_request.milestone.number\": null, '\n", + " '\"pull_request.milestone.title\": null, '\n", + " '\"pull_request.milestone.description\": null, '\n", + " '\"pull_request.milestone.creator.login\": null, '\n", + " '\"pull_request.milestone.creator.id\": null, '\n", + " '\"pull_request.milestone.creator.type\": null, '\n", + " '\"pull_request.milestone.creator.site_admin\": null, '\n", + " '\"pull_request.milestone.open_issues\": null, '\n", + " '\"pull_request.milestone.closed_issues\": null, '\n", + " '\"pull_request.milestone.state\": null, '\n", + " '\"pull_request.milestone.created_at\": null, '\n", + " '\"pull_request.milestone.updated_at\": null, '\n", + " '\"pull_request.milestone.due_on\": null, '\n", + " '\"pull_request.milestone.closed_at\": 
null, '\n", + " '\"pull_request.merged\": true, '\n", + " '\"pull_request.mergeable\": null, '\n", + " '\"pull_request.mergeable_state\": \"unknown\", '\n", + " '\"pull_request.merged_by.login\": \"sxa\", '\n", + " '\"pull_request.merged_by.id\": 6487691, '\n", + " '\"pull_request.merged_by.type\": \"User\", '\n", + " '\"pull_request.merged_by.site_admin\": false, '\n", + " '\"pull_request.comments\": 0, '\n", + " '\"pull_request.review_comments\": 0, '\n", + " '\"pull_request.commits\": 1, \"pull_request.additions\": '\n", + " '4, \"pull_request.deletions\": 6, '\n", + " '\"pull_request.changed_files\": 1, '\n", + " '\"pull_request.label.id\": null, '\n", + " '\"pull_request.label.name\": null, '\n", + " '\"pull_request.label.color\": null, '\n", + " '\"pull_request.label.default\": null, '\n", + " '\"pull_request.head.label\": \"M-Davies:revert\", '\n", + " '\"pull_request.head.ref\": \"revert\", '\n", + " '\"pull_request.head.sha\": '\n", + " '\"023faba7db4130d746f68e6b4fb26170a3834254\", '\n", + " '\"pull_request.head.user.login\": \"M-Davies\", '\n", + " '\"pull_request.head.user.type\": \"User\", '\n", + " '\"pull_request.head.repo.name\": \"openjdk-build\", '\n", + " '\"pull_request.head.repo.full_name\": '\n", + " '\"M-Davies/openjdk-build\", '\n", + " '\"pull_request.head.repo.owner.login\": \"M-Davies\", '\n", + " '\"pull_request.head.repo.owner.type\": \"User\", '\n", + " '\"pull_request.head.repo.private\": false, '\n", + " '\"pull_request.head.repo.homepage\": \"\", '\n", + " '\"pull_request.head.repo.description\": \"AdoptOpenJDK '\n", + " 'community OpenJDK build scripts - common across all '\n", + " 'releases/versions\", \"pull_request.head.repo.fork\": '\n", + " 'true, \"pull_request.head.repo.created_at\": '\n", + " '\"2019-11-29T09:24:43Z\", '\n", + " '\"pull_request.head.repo.updated_at\": '\n", + " '\"2020-05-27T14:45:16Z\", '\n", + " '\"pull_request.head.repo.pushed_at\": '\n", + " '\"2020-05-28T09:46:04Z\", '\n", + " 
'\"pull_request.head.repo.size\": 2383, '\n", + " '\"pull_request.head.repo.stargazers_count\": 0, '\n", + " '\"pull_request.head.repo.watchers_count\": 0, '\n", + " '\"pull_request.head.repo.language\": \"Shell\", '\n", + " '\"pull_request.head.repo.has_issues\": false, '\n", + " '\"pull_request.head.repo.has_projects\": true, '\n", + " '\"pull_request.head.repo.has_downloads\": true, '\n", + " '\"pull_request.head.repo.has_wiki\": true, '\n", + " '\"pull_request.head.repo.has_pages\": false, '\n", + " '\"pull_request.head.repo.forks_count\": 0, '\n", + " '\"pull_request.head.repo.archived\": false, '\n", + " '\"pull_request.head.repo.disabled\": false, '\n", + " '\"pull_request.head.repo.open_issues_count\": 0, '\n", + " '\"pull_request.head.repo.forks\": 0, '\n", + " '\"pull_request.head.repo.open_issues\": 0, '\n", + " '\"pull_request.head.repo.watchers\": 0, '\n", + " '\"pull_request.head.repo.default_branch\": \"master\", '\n", + " '\"pull_request.head.repo.license.key\": \"apache-2.0\", '\n", + " '\"pull_request.head.repo.license.spdx_id\": '\n", + " '\"Apache-2.0\", \"pull_request.head.repo.license.name\": '\n", + " '\"Apache License 2.0\", \"pull_request.base.label\": '\n", + " '\"AdoptOpenJDK:master\", \"pull_request.base.ref\": '\n", + " '\"master\", \"pull_request.base.sha\": '\n", + " '\"32a19e7a01b4d50cc8c10f8f675a2aeb2ffeaefb\", '\n", + " '\"pull_request.base.user.login\": \"AdoptOpenJDK\", '\n", + " '\"pull_request.base.user.type\": \"Organization\", '\n", + " '\"pull_request.base.repo.name\": \"openjdk-build\", '\n", + " '\"pull_request.base.repo.full_name\": '\n", + " '\"AdoptOpenJDK/openjdk-build\", '\n", + " '\"pull_request.base.repo.owner.login\": \"AdoptOpenJDK\", '\n", + " '\"pull_request.base.repo.owner.type\": \"Organization\", '\n", + " '\"pull_request.base.repo.private\": false, '\n", + " '\"pull_request.base.repo.homepage\": \"\", '\n", + " '\"pull_request.base.repo.description\": \"AdoptOpenJDK '\n", + " 'community OpenJDK build scripts 
- common across all '\n", + " 'releases/versions\", \"pull_request.base.repo.fork\": '\n", + " 'false, \"pull_request.base.repo.created_at\": '\n", + " '\"2017-03-17T09:31:50Z\", '\n", + " '\"pull_request.base.repo.updated_at\": '\n", + " '\"2020-05-28T07:45:12Z\", '\n", + " '\"pull_request.base.repo.pushed_at\": '\n", + " '\"2020-05-28T09:51:48Z\", '\n", + " '\"pull_request.base.repo.size\": 2234, '\n", + " '\"pull_request.base.repo.stargazers_count\": 620, '\n", + " '\"pull_request.base.repo.watchers_count\": 620, '\n", + " '\"pull_request.base.repo.language\": \"Shell\", '\n", + " '\"pull_request.base.repo.has_issues\": true, '\n", + " '\"pull_request.base.repo.has_projects\": true, '\n", + " '\"pull_request.base.repo.has_downloads\": true, '\n", + " '\"pull_request.base.repo.has_wiki\": true, '\n", + " '\"pull_request.base.repo.has_pages\": false, '\n", + " '\"pull_request.base.repo.forks_count\": 137, '\n", + " '\"pull_request.base.repo.archived\": false, '\n", + " '\"pull_request.base.repo.disabled\": false, '\n", + " '\"pull_request.base.repo.open_issues_count\": 165, '\n", + " '\"pull_request.base.repo.forks\": 137, '\n", + " '\"pull_request.base.repo.open_issues\": 165, '\n", + " '\"pull_request.base.repo.watchers\": 620, '\n", + " '\"pull_request.base.repo.default_branch\": \"master\", '\n", + " '\"pull_request.base.repo.license.key\": \"apache-2.0\", '\n", + " '\"pull_request.base.repo.license.spdx_id\": '\n", + " '\"Apache-2.0\", \"pull_request.base.repo.license.name\": '\n", + " '\"Apache License 2.0\", \"pull_request.guid\": '\n", + " '\"AdoptOpenJDK/openjdk-build/pull/1787\"}]',\n", + " 'pull_request.guid': 'AdoptOpenJDK/openjdk-build/pull/1787',\n", + " 'pull_request.issue_events': None}\n" + ] + } + ], + "source": [ + "from pprint import pprint\n", + "pprint(small_ds[50])" + ] + }, + { + "cell_type": "code", + "execution_count": 151, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'action': 
'opened',\n", + " 'actor.id': 25231953,\n", + " 'actor.login': 'M-Davies',\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': datetime.datetime(2020, 5, 28, 9, 45, 30, tzinfo=),\n", + " 'issue.author': None,\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': None,\n", + " 'pull_request.merged': False,\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.state': 'open',\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'PullRequestEvent',\n", + " 'user.login': None,\n", + " 'user.type': None},\n", + " {'action': 'closed',\n", + " 'actor.id': 6487691,\n", + " 'actor.login': 'sxa',\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 
'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': datetime.datetime(2020, 5, 28, 9, 51, 49, tzinfo=),\n", + " 'issue.author': None,\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': None,\n", + " 'pull_request.merged': True,\n", + " 'pull_request.merged_by.login': 'sxa',\n", + " 'pull_request.merged_by.type': 'User',\n", + " 'pull_request.state': 'closed',\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'PullRequestEvent',\n", + " 'user.login': None,\n", + " 'user.type': None}]\n" + ] + } + ], + "source": [ + "pprint(merged_ds[50][\"events\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 222, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Found cached dataset parquet (/Users/loubnabenallal/.cache/huggingface/datasets/loubnabnl___parquet/loubnabnl--code_reviews_3-c3e4ac735edf14b4/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)\n" + ] + } + ], + "source": [ + "ds = load_dataset(\"loubnabnl/code_reviews_3\", split=\"train\")\n", + "size = len(ds)" + ] + }, + { + "cell_type": "code", + "execution_count": 223, + "metadata": {}, + "outputs": [], + "source": [ + "sample = ds[1470]\n", + "events = sample[\"events\"]\n", + "grouped_events = create_grouped_events(events)\n", + "original_poster = sample[\"pull_request_info\"]['pull_request.user.login']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pprint import pprint\n", + "\n", + "pprint(small_ds[50])" + ] + }, + { + "cell_type": "code", + "execution_count": 224, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "📝 **Title**: Fix @inheritDocs behavior
\n", + " 📦 **GitHub Repo**: Azure/azure-sdk-for-java, PR Number: 26816, ID: 836647691.
\n", + " Link: [https://github.com/Azure/azure-sdk-for-java/pull/26816](https://github.com/Azure/azure-sdk-for-java/pull/26816)" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AttributeDetail
🧾 PR Typeissue
🟢 PR Stateopen
👤 PR Authorkasobol-msft
🏷️ Head Branchref: kasobol-msft-patch-1, label: Azure:kasobol-msft-patch-1
🌳 Base Branchmain
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Make sure that dependency sources are included in javadoc generation.\r\n", + "\r\n", + "Fixes https://github.com/Azure/azure-sdk-for-java/issues/26814" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def get_pr_info(sample):\n", + " pr_info = sample[\"pull_request_info\"]\n", + " head_info = sample[\"head_repo_info\"]\n", + " base_info = sample[\"base_repo_info\"]\n", + " events = sample[\"events\"]\n", + "\n", + " gh_link = f\"https://github.com/{pr_info['repo.name']}/pull/{pr_info['pull_request.number']}\"\n", + "\n", + " header = f\"\"\"📝 **Title**: {pr_info['pull_request.title']}
\n", + " 📦 **GitHub Repo**: {pr_info['repo.name']}, PR Number: {pr_info['pull_request.number']}, ID: {pr_info['pull_request.id']}.
\n", + " Link: [{gh_link}]({gh_link})\"\"\"\n", + " pr_info_html = f\"\"\"\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AttributeDetail
🧾 PR Type{events[0]['type']}
🟢 PR State{pr_info['pull_request.state']}
👤 PR Author{pr_info['pull_request.user.login']}
🏷️ Head Branchref: {head_info['pull_request.head.ref']}, label: {head_info['pull_request.head.label']}
🌳 Base Branch{base_info['pull_request.base.ref']}
\n", + " \"\"\"\n", + " return header, pr_info_html, pr_info['pull_request.body']\n", + "\n", + "from IPython.display import HTML, display\n", + "display(HTML(get_pr_info(sample)[0]))\n", + "display(HTML(get_pr_info(sample)[1]))\n", + "display(HTML(get_pr_info(sample)[2]))" + ] + }, + { + "cell_type": "code", + "execution_count": 308, + "metadata": {}, + "outputs": [], + "source": [ + "sample = ds[4]\n", + "events = sample[\"events\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 309, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2" + ] + }, + "execution_count": 309, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(events)" + ] + }, + { + "cell_type": "code", + "execution_count": 310, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'action': 'opened',\n", + " 'actor.id': 39814207,\n", + " 'actor.login': 'pull[bot]',\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': datetime.datetime(2022, 10, 10, 10, 57, 41, tzinfo=),\n", + " 'issue.author': None,\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': None,\n", + " 'pull_request.merged': False,\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.state': 'open',\n", + " 'review.author_association': None,\n", + " 
'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'PullRequestEvent',\n", + " 'user.login': None,\n", + " 'user.type': None},\n", + " {'action': 'closed',\n", + " 'actor.id': 39814207,\n", + " 'actor.login': 'pull[bot]',\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': datetime.datetime(2022, 10, 10, 11, 1, 28, tzinfo=),\n", + " 'issue.author': None,\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': None,\n", + " 'pull_request.merged': True,\n", + " 'pull_request.merged_by.login': 'pull[bot]',\n", + " 'pull_request.merged_by.type': 'Bot',\n", + " 'pull_request.state': 'closed',\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'PullRequestEvent',\n", + " 'user.login': None,\n", + " 'user.type': None}]\n" + ] + } + ], + "source": [ + "pprint(events)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "import uuid\n", + "\n", + "def create_grouped_events(events):\n", + " df = pd.DataFrame(events)\n", + " # Ensure it's in datetime format\n", + " df['created_at'] = pd.to_datetime(df['created_at'])\n", + 
" # Create a new column 'uuid' initialized with None\n", + " df['uuid'] = None\n", + " # For rows where either 'comment.diff_hunk' or 'comment.commit_id' is NaN, assign a unique UUID\n", + " mask = df['comment.diff_hunk'].isna() | df['comment.commit_id'].isna()\n", + " df.loc[mask, 'uuid'] = [str(uuid.uuid4()) for _ in range(mask.sum())]\n", + " # Group by 'comment.diff_hunk', 'comment.commit_id', and 'uuid'\n", + " grouped_events = [group.drop(columns='uuid').to_dict(orient='records') for _, group in df.groupby(['comment.diff_hunk', 'comment.commit_id', 'uuid'], dropna=False)]\n", + " return grouped_events\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 229, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "846\n" + ] + } + ], + "source": [ + "for i in range(len(ds)):\n", + " e = ds[i]\n", + " if e[\"events\"][0][\"comment.diff_hunk\"]:\n", + " print(i)\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": 299, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'action': 'opened',\n", + " 'actor.id': 1753262,\n", + " 'actor.login': 'mo9a7i',\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': datetime.datetime(2022, 5, 5, 4, 35, 2, tzinfo=),\n", + " 'issue.author': None,\n", + " 'issue.comment': None,\n", + " 
'issue.comment_id': None,\n", + " 'pull_request.merged': False,\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.state': 'open',\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'PullRequestEvent',\n", + " 'user.login': None,\n", + " 'user.type': None},\n", + " {'action': 'created',\n", + " 'actor.id': 1753262,\n", + " 'actor.login': 'mo9a7i',\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': datetime.datetime(2022, 5, 5, 4, 35, 2, tzinfo=),\n", + " 'issue.author': None,\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': None,\n", + " 'pull_request.merged': None,\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.state': 'open',\n", + " 'review.author_association': 'MEMBER',\n", + " 'review.body': 'looks fine',\n", + " 'review.commit_id': 'ba75444d1ada77cf5f3f06cd74b6320bab8db54b',\n", + " 'review.id': 962846794,\n", + " 'review.state': 'commented',\n", + " 'review.submitted_at': '2022-05-05T04:35:02Z',\n", + " 'type': 'PullRequestReviewEvent',\n", + " 'user.login': 'mo9a7i',\n", + " 'user.type': 'User'},\n", + " {'action': 'closed',\n", + " 'actor.id': 
1753262,\n", + " 'actor.login': 'mo9a7i',\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': datetime.datetime(2022, 5, 5, 4, 35, 3, tzinfo=),\n", + " 'issue.author': None,\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': None,\n", + " 'pull_request.merged': True,\n", + " 'pull_request.merged_by.login': 'mo9a7i',\n", + " 'pull_request.merged_by.type': 'User',\n", + " 'pull_request.state': 'closed',\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'PullRequestEvent',\n", + " 'user.login': None,\n", + " 'user.type': None}]\n" + ] + } + ], + "source": [ + "pprint(events)" + ] + }, + { + "cell_type": "code", + "execution_count": 303, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
actionactor.idactor.logincomment.author_associationcomment.bodycomment.commit_idcomment.created_atcomment.diff_hunkcomment.idcomment.in_reply_to_id...review.author_associationreview.bodyreview.commit_idreview.idreview.statereview.submitted_attypeuser.loginuser.typegroup_key
0opened1753262mo9a7iNoneNoneNoneNoneNoneNoneNone...NoneNoneNoneNaNNoneNonePullRequestEventNoneNone1.0
1created1753262mo9a7iNoneNoneNoneNoneNoneNoneNone...MEMBERlooks fineba75444d1ada77cf5f3f06cd74b6320bab8db54b962846794.0commented2022-05-05T04:35:02ZPullRequestReviewEventmo9a7iUserba75444d1ada77cf5f3f06cd74b6320bab8db54b
2closed1753262mo9a7iNoneNoneNoneNoneNoneNoneNone...NoneNoneNoneNaNNoneNonePullRequestEventNoneNone2.0
\n", + "

3 rows × 39 columns

\n", + "
" + ], + "text/plain": [ + " action actor.id actor.login comment.author_association comment.body \n", + "0 opened 1753262 mo9a7i None None \\\n", + "1 created 1753262 mo9a7i None None \n", + "2 closed 1753262 mo9a7i None None \n", + "\n", + " comment.commit_id comment.created_at comment.diff_hunk comment.id \n", + "0 None None None None \\\n", + "1 None None None None \n", + "2 None None None None \n", + "\n", + " comment.in_reply_to_id ... review.author_association review.body \n", + "0 None ... None None \\\n", + "1 None ... MEMBER looks fine \n", + "2 None ... None None \n", + "\n", + " review.commit_id review.id review.state \n", + "0 None NaN None \\\n", + "1 ba75444d1ada77cf5f3f06cd74b6320bab8db54b 962846794.0 commented \n", + "2 None NaN None \n", + "\n", + " review.submitted_at type user.login user.type \n", + "0 None PullRequestEvent None None \\\n", + "1 2022-05-05T04:35:02Z PullRequestReviewEvent mo9a7i User \n", + "2 None PullRequestEvent None None \n", + "\n", + " group_key \n", + "0 1.0 \n", + "1 ba75444d1ada77cf5f3f06cd74b6320bab8db54b \n", + "2 2.0 \n", + "\n", + "[3 rows x 39 columns]" + ] + }, + "execution_count": 303, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "df = pd.DataFrame(events)\n", + "df['created_at'] = pd.to_datetime(df['created_at'])\n", + "df.drop_duplicates(inplace=True)\n", + "# Create a new 'group_key' column. For non-null 'review.commit_id' values, it's the same value.\n", + "mask = df['review.commit_id'].isnull()\n", + "df.loc[mask, 'group_key'] = np.arange(mask.sum()) + 1\n", + "df.loc[~mask, 'group_key'] = df['review.commit_id']\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 304, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "df = pd.DataFrame(events)\n", + "df['created_at'] = pd.to_datetime(df['created_at'])\n", + "df.drop_duplicates(inplace=True)\n", + "# Create a new 'group_key' column. 
For non-null 'review.commit_id' values, it's the same value.\n", + "mask = df['review.commit_id'].isnull()\n", + "df.loc[mask, 'group_key'] = np.arange(mask.sum()) + 1\n", + "df.loc[~mask, 'group_key'] = df['review.commit_id']\n", + "\n", + "if len(df) == 1:\n", + " grouped_events = [[df.iloc[0].to_dict()]]\n", + "else:\n", + " grouped_events = [group.to_dict(orient='records') for _, group in df.groupby('group_key', dropna=False)]\n", + "\n", + "# sort by first event date\n", + "grouped_events = sorted(grouped_events, key=lambda x: x[0]['created_at'])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 311, + "metadata": {}, + "outputs": [], + "source": [ + "def create_grouped_events(events):\n", + " \"\"\"group events that happened in the same review thread using review.commit_id\"\"\"\n", + " df = pd.DataFrame(events)\n", + " df['created_at'] = pd.to_datetime(df['created_at'])\n", + " df.drop_duplicates(inplace=True)\n", + " # Create a new 'group_key' where rows with NaN 'review.commit_id' get an identical identifier. 
Otherwise NaN values go in the same group\n", + " mask = df['review.commit_id'].isnull()\n", + " df.loc[mask, 'group_key'] = np.arange(mask.sum()) + 1\n", + " df.loc[~mask, 'group_key'] = df['review.commit_id']\n", + " \n", + " if len(df) == 1:\n", + " grouped_events = [[df.iloc[0].to_dict()]]\n", + " else:\n", + " grouped_events = [group.to_dict(orient='records') for _, group in df.groupby('group_key', dropna=False)]\n", + " \n", + " # sort by first event date\n", + " grouped_events = sorted(grouped_events, key=lambda x: x[0]['created_at'])\n", + " return grouped_events\n", + "\n", + "grouped_events = create_grouped_events(events)" + ] + }, + { + "cell_type": "code", + "execution_count": 312, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "len events 2 and len grouped_events 2\n" + ] + } + ], + "source": [ + "print(f\"len events {len(events)} and len grouped_events {len(grouped_events)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 313, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "thread number 0\n", + "thread number 1\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \"\"\n", + " \n", + " \n", + "
Event TypePullRequestEvent
Userpull[bot]
Actionopened
Review StateNone
PR Stateopen, merged: False
Date2022-10-10 10:57:41+00:00
\n", + "
\n", + " \n", + "---------------------------------------------------------------------------------------------------------------------------------------------------------------------
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \"\"\n", + " \n", + " \n", + "
Event TypePullRequestEvent
Userpull[bot]
Actionclosed
Review StateNone
PR Stateclosed, merged: True
Date2022-10-10 11:01:28+00:00
\n", + "
\n", + " \n", + "---------------------------------------------------------------------------------------------------------------------------------------------------------------------
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "original_poster = sample[\"pull_request_info\"]['pull_request.user.login']\n", + "thread_html = \"\"\n", + "c = 0\n", + "for thread in grouped_events:\n", + " print(f\"thread number {c}\")\n", + " c += 1\n", + " thread_html += '
'\n", + " # Get the first event in the thread as a reference\n", + " first_event = thread[0]\n", + " poster_name = first_event['actor.login'] or first_event['issue.author'] or first_event['user.login']\n", + " # Add shared parts of the events only once\n", + " user_type = f\"(type :{first_event['user.type']})\" if first_event['user.type'] else \"\"\n", + " review_state = f\"Review State{first_event['review.state']}\" if first_event['review.state'] else \"\"\n", + " text = f\"\"\"\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " {review_state}\n", + " \n", + " \n", + "
Event Type{first_event['type']}
User{poster_name} {user_type}
Action{first_event['action']}
PR State{first_event[\"pull_request.state\"]}, merged: {first_event['pull_request.merged']}
Date{first_event['created_at']}
\n", + "
\n", + " \"\"\"\n", + " highlight_action = \"background-color: #FFCFCF;\" if first_event['action'] == 'closed' else \"\"\n", + " highlight_pr_state = \"background-color: #FFCFCF;\" if first_event['pull_request.merged'] else \"\"\n", + "\n", + " text = f\"\"\"\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \"\"\n", + " \n", + " \n", + "
Event Type{first_event['type']}
User{poster_name} {user_type}
Action{first_event['action']}
Review State{first_event['review.state']}
PR State{first_event[\"pull_request.state\"]}, merged: {first_event['pull_request.merged']}
Date{first_event['created_at']}
\n", + "
\n", + " \"\"\"\n", + "\n", + "\n", + " thread_html += text\n", + " thread_html += (\"\\n\" + \"-\"*165)\n", + " # Add the bodies of the comments for each event in the thread\n", + " for event in thread:\n", + " # from 'actor.login' and 'issue.author' and 'user.login' take which ever isn't none\n", + " poster_name = event['actor.login'] or event['issue.author'] or event['user.login']\n", + " if event['comment.body'] or event[\"issue.comment\"]:\n", + " is_op = original_poster == poster_name\n", + " thread_html += format_body(event['comment.body'], poster_name, is_op)\n", + "\n", + " thread_html += '
'\n", + "\n", + "display(HTML(thread_html))" + ] + }, + { + "cell_type": "code", + "execution_count": 314, + "metadata": {}, + "outputs": [], + "source": [ + "def display_events(sample):\n", + " events = sample[\"events\"]\n", + " grouped_events = create_grouped_events(events)\n", + " original_poster = sample[\"pull_request_info\"]['pull_request.user.login']\n", + " for thread in grouped_events:\n", + " thread_html = '
'\n", + " # Get the first event in the thread as a reference\n", + " first_event = thread[0]\n", + " poster_name = first_event['actor.login'] or first_event['issue.author'] or first_event['user.login']\n", + " # Add shared parts of the events only once\n", + " user_type = f\"(type :{first_event['user.type']})\" if first_event['user.type'] else \"\"\n", + " highlight_action = \"background-color: #FFCFCF;\" if first_event['action'] == 'closed' else \"\"\n", + " highlight_pr_state = \"background-color: #FFCFCF;\" if first_event['pull_request.merged'] else \"\"\n", + " \n", + " text = f\"\"\"\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Event Type{first_event['type']}
User{poster_name} {user_type}
Action{first_event['action']}
Review State{first_event['review.state']}
PR State{first_event[\"pull_request.state\"]}, merged: {first_event['pull_request.merged']}
Date{first_event['created_at']}
\n", + "
\n", + " \"\"\"\n", + " print(f\"added first event of teh group\")\n", + " thread_html += text\n", + " \n", + " # Add the bodies of the comments for each event in the thread\n", + " for event in thread:\n", + " # from 'actor.login' and 'issue.author' and 'user.login' take which ever isn't none\n", + " poster_name = event['actor.login'] or event['issue.author'] or event['user.login']\n", + " if event['comment.body'] or event[\"issue.comment\"]:\n", + " is_op = original_poster == poster_name\n", + " thread_html += format_body(event['comment.body'], poster_name, is_op)\n", + "\n", + " thread_html += '
'\n", + " display(HTML(thread_html))\n", + " if first_event['comment.path']:\n", + " path_html = f\"Path: {first_event['comment.path']}\"\n", + " display(HTML(path_html))\n", + " display(HTML(\"---\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 316, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[[{'action': 'opened',\n", + " 'actor.id': 39814207,\n", + " 'actor.login': 'pull[bot]',\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': Timestamp('2022-10-10 10:57:41+0000', tz='UTC'),\n", + " 'issue.author': None,\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': None,\n", + " 'pull_request.merged': False,\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.state': 'open',\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'PullRequestEvent',\n", + " 'user.login': None,\n", + " 'user.type': None,\n", + " 'group_key': 1.0}],\n", + " [{'action': 'closed',\n", + " 'actor.id': 39814207,\n", + " 'actor.login': 'pull[bot]',\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': 
None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': Timestamp('2022-10-10 11:01:28+0000', tz='UTC'),\n", + " 'issue.author': None,\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': None,\n", + " 'pull_request.merged': True,\n", + " 'pull_request.merged_by.login': 'pull[bot]',\n", + " 'pull_request.merged_by.type': 'Bot',\n", + " 'pull_request.state': 'closed',\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'PullRequestEvent',\n", + " 'user.login': None,\n", + " 'user.type': None,\n", + " 'group_key': 2.0}]]" + ] + }, + "execution_count": 316, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grouped_events" + ] + }, + { + "cell_type": "code", + "execution_count": 315, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Event TypePullRequestEvent
Userpull[bot]
Actionopened
Review StateNone
PR Stateopen, merged: False
Date2022-10-10 10:57:41+00:00
\n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "---" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Event TypePullRequestEvent
Userpull[bot]
Actionopened
Review StateNone
PR Stateopen, merged: False
Date2022-10-10 10:57:41+00:00
\n", + "
\n", + "
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Event TypePullRequestEvent
Userpull[bot]
Actionclosed
Review StateNone
PR Stateclosed, merged: True
Date2022-10-10 11:01:28+00:00
\n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "---" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display_events(sample)" + ] + }, + { + "cell_type": "code", + "execution_count": 261, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "single\n", + "2022-05-05 04:35:02+00:00\n", + "with review state: commented\n", + "None\n", + "User: mo9a7i, action: created\n", + "PullRequestReviewEvent\n", + "------------\n", + "multiple\n", + "single\n", + "2022-05-05 04:35:02+00:00\n", + "with review state: None\n", + "None\n", + "User: mo9a7i, action: opened\n", + "PullRequestEvent\n", + "------------\n", + "------------\n", + "2022-05-05 04:35:02+00:00\n", + "with review state: None PR state False\n", + "None\n", + "User: mo9a7i, action: closed\n", + "PullRequestEvent\n", + "------------\n", + "------------end multiple\n" + ] + } + ], + "source": [ + "for group in grouped_events:\n", + " if len(group) == 1:\n", + " poster_name = group[0]['actor.login'] or group[0]['issue.author'] or group[0]['user.login']\n", + " print(\"single\")\n", + " print(group[0][\"created_at\"])\n", + " print(f\"with review state: {group[0]['review.state']}\")\n", + " print(group[0][\"comment.body\"])\n", + " # print action type and user\n", + " print(f\"User: {poster_name}, action: {group[0]['action']}\")\n", + " print(group[0][\"type\"])\n", + " print(\"------------\")\n", + " continue\n", + " # date \n", + " else:\n", + " print(\"multiple\")\n", + " poster_name = group[0]['actor.login'] or group[0]['issue.author'] or group[0]['user.login']\n", + " print(\"single\")\n", + " print(group[0][\"created_at\"])\n", + " print(f\"with review state: {group[0]['review.state']}\")\n", + " print(group[0][\"comment.body\"])\n", + " # print action type and user\n", + " print(f\"User: {poster_name}, action: 
{group[0]['action']}\")\n", + " print(group[0][\"type\"])\n", + " print(\"------------\")\n", + " print(\"------------\")\n", + " for e in group[1:]:\n", + " print(group[0][\"created_at\"])\n", + " print(f\"with review state: {group[0]['review.state']} PR state {group[0]['pull_request.merged']}\")\n", + " print(e[\"comment.body\"])\n", + " poster_name = e['actor.login'] or e['issue.author'] or e['user.login']\n", + " print(f\"User: {poster_name}, action: {e['action']}\")\n", + " print(e[\"type\"])\n", + " print(\"------------\")\n", + " print(\"------------end multiple\")" + ] + }, + { + "cell_type": "code", + "execution_count": 225, + "metadata": {}, + "outputs": [], + "source": [ + "def create_grouped_events(events):\n", + " df = pd.DataFrame(events)\n", + " df['created_at'] = pd.to_datetime(df['created_at'])\n", + " df = df.sort_values(['comment.diff_hunk', 'comment.commit_id', 'created_at'])\n", + " # Group events in a the same thread using 'comment.diff_hunk' and 'comment.commit_id'\n", + " if len(df) == 1:\n", + " grouped_events = [[df.iloc[0].to_dict()]]\n", + " else:\n", + " grouped_events = [group.to_dict(orient='records') for _, group in df.groupby(['comment.diff_hunk', 'comment.commit_id', 'pull_request.state'], dropna=False)]\n", + " return grouped_events\n", + "\n", + "def format_body(text, user, is_op=False):\n", + " color = \"#007bff\" if is_op else \"black\"\n", + " pr_body = f\"
👤{user}: {text}
\"\n", + " return pr_body" + ] + }, + { + "cell_type": "code", + "execution_count": 220, + "metadata": {}, + "outputs": [], + "source": [ + "import uuid\n", + "import pandas as pd\n", + "\n", + "def create_grouped_events(events):\n", + " df = pd.DataFrame(events)\n", + " \n", + " # Ensure it's in datetime format\n", + " df['created_at'] = pd.to_datetime(df['created_at'])\n", + " # Preserve the original order\n", + " df['order'] = range(len(df))\n", + "\n", + " # Create a new column 'uuid' initialized with None\n", + " df['uuid'] = None\n", + "\n", + " # For rows where either 'comment.diff_hunk' or 'comment.commit_id' is NaN, assign a unique UUID\n", + " mask = df['comment.diff_hunk'].isna() | df['comment.commit_id'].isna()\n", + " df.loc[mask, 'uuid'] = [str(uuid.uuid4()) for _ in range(mask.sum())]\n", + "\n", + " # Group by 'comment.diff_hunk', 'comment.commit_id', and 'uuid'\n", + " grouped_events = [group.drop(columns=['uuid', 'order']).to_dict(orient='records') \n", + " for _, group in df.sort_values(by='order').groupby(['comment.diff_hunk', 'comment.commit_id', 'uuid'], dropna=False)]\n", + " # soert on created_at\n", + " grouped_events = [sorted(group, key=lambda x: x['created_at']) for group in grouped_events]\n", + " return grouped_events\n", + "\n", + "\n", + "\n", + "grouped_events = create_grouped_events(events)\n", + "c = 0\n", + "thread_html = \"\"\n", + "for thread in grouped_events:\n", + " # Start a new thread\n", + " #print(thread)\n", + " if thread[0][\"action\"] == \"opened\":\n", + " continue\n", + " thread_html += '
'\n", + " # Get the first event in the thread as a reference\n", + " first_event = thread[0]\n", + " poster_name = first_event['actor.login'] or first_event['issue.author'] or first_event['user.login']\n", + " # Add shared parts of the events only once\n", + " text = f\"\"\"\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Event Type{first_event['type']}
User{poster_name} (type :{first_event['user.type']})
Action{first_event['action']}
Review State{first_event['review.state']}
PR State{first_event[\"pull_request.state\"]}, merged: {first_event['pull_request.merged']}
From Head{sample[\"head_repo_info\"]['pull_request.head.label']}
\n", + "
\n", + " \"\"\"\n", + " thread_html += text\n", + " # add horizontal line\n", + " thread_html += '
'\n", + " for event in thread:\n", + " # from 'actor.login' and 'issue.author' and 'user.login' take which ever isn't none\n", + " poster_name = event['actor.login'] or event['issue.author'] or event['user.login']\n", + " if event['comment.body'] or event[\"issue.comment\"]:\n", + " is_op = original_poster == poster_name\n", + " thread_html += format_body(event['comment.body'], poster_name, is_op)\n", + "\n", + " thread_html += '
'" + ] + }, + { + "cell_type": "code", + "execution_count": 218, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4" + ] + }, + "execution_count": 218, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(grouped_events)" + ] + }, + { + "cell_type": "code", + "execution_count": 221, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[{'action': 'created',\n", + " 'actor.id': nan,\n", + " 'actor.login': None,\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': Timestamp('2022-02-01 00:05:19+0000', tz='UTC'),\n", + " 'issue.author': 'kasobol-msft',\n", + " 'issue.comment': \"This won't work well because it includes dependencies in \"\n", + " 'output like this:\\r\\n'\n", + " '![image](https://user-images.githubusercontent.com/61715331/151893024-ef3e99d9-0d83-44c6-839b-966550320642.png)\\r\\n'\n", + " '\\r\\n'\n", + " \"There's hacky way to side step this:\\r\\n\"\n", + " '![image](https://user-images.githubusercontent.com/61715331/151893056-8d018cb9-2f0d-4c7d-8848-eb9df9028b88.png)\\r\\n'\n", + " '\\r\\n'\n", + " 'But it would require be explicit about each dependency in '\n", + " 'each sdk to be precise and not risk any \"dependency doc '\n", + " 'leaks\".',\n", + " 'issue.comment_id': 1026335328.0,\n", + " 'pull_request.merged': None,\n", + " 
'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.state': None,\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'comment',\n", + " 'user.login': None,\n", + " 'user.type': None}],\n", + " [{'action': 'opened',\n", + " 'actor.id': 61715331.0,\n", + " 'actor.login': 'kasobol-msft',\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': Timestamp('2022-01-31 22:51:21+0000', tz='UTC'),\n", + " 'issue.author': None,\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': nan,\n", + " 'pull_request.merged': False,\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.state': 'open',\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'PullRequestEvent',\n", + " 'user.login': None,\n", + " 'user.type': None}],\n", + " [{'action': 'opened',\n", + " 'actor.id': nan,\n", + " 'actor.login': None,\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 
'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': Timestamp('2022-01-31 22:51:20+0000', tz='UTC'),\n", + " 'issue.author': 'kasobol-msft',\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': nan,\n", + " 'pull_request.merged': None,\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.state': None,\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'issue',\n", + " 'user.login': None,\n", + " 'user.type': None}],\n", + " [{'action': 'closed',\n", + " 'actor.id': 61715331.0,\n", + " 'actor.login': 'kasobol-msft',\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': Timestamp('2022-02-01 00:05:20+0000', tz='UTC'),\n", + " 
'issue.author': None,\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': nan,\n", + " 'pull_request.merged': False,\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.state': 'closed',\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'PullRequestEvent',\n", + " 'user.login': None,\n", + " 'user.type': None}]]\n" + ] + } + ], + "source": [ + "pprint(grouped_events)" + ] + }, + { + "cell_type": "code", + "execution_count": 193, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Event TypePullRequestEvent
Userkasobol-msft (type :None)
Actionclosed
Review StateNone
PR Stateclosed, merged: False
From HeadAzure:kasobol-msft-patch-1
\n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from IPython.display import HTML, display\n", + "display(HTML(thread_html))" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'pull_request.base.label': 'AbdElrahmanMuhammedNasr:master',\n", + " 'pull_request.base.ref': 'master',\n", + " 'pull_request.base.repo.default_branch': 'master',\n", + " 'pull_request.base.repo.description': None,\n", + " 'pull_request.base.repo.forks_count': 0,\n", + " 'pull_request.base.repo.homepage': None,\n", + " 'pull_request.base.repo.language': 'TypeScript',\n", + " 'pull_request.base.repo.license.name': None,\n", + " 'pull_request.base.repo.name': 'WuzuufMasr',\n", + " 'pull_request.base.repo.open_issues_count': 24,\n", + " 'pull_request.base.repo.owner.login': 'AbdElrahmanMuhammedNasr',\n", + " 'pull_request.base.repo.owner.type': 'User',\n", + " 'pull_request.base.repo.private': False,\n", + " 'pull_request.base.repo.stargazers_count': 0,\n", + " 'pull_request.base.repo.watchers_count': 0,\n", + " 'pull_request.base.sha': 'a7d0127c02152dca69c41f83afb1a0a4d0c0e004',\n", + " 'pull_request.base.user.login': 'AbdElrahmanMuhammedNasr',\n", + " 'pull_request.base.user.type': 'User',\n", + " 'pull_request.comments': 0,\n", + " 'pull_request.label.name': None,\n", + " 'pull_request.review_comments': 0}" + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged_ds[0][\"base_repo_info\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [], + "source": [ + "ds = merged_ds" + ] + }, + { + "cell_type": "code", + "execution_count": 321, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'__index_level_0__': 175,\n", + " 'bucket': '940',\n", + " 'pull_request.code_review_events': None,\n", + " 
'pull_request.events': '[{\"type\": \"PullRequestEvent\", \"action\": \"opened\", '\n", + " '\"actor.login\": \"pkarman\", \"actor.id\": 1205061, '\n", + " '\"user.login\": null, \"user.id\": null, \"user.type\": '\n", + " 'null, \"repo.name\": \"18F/C2\", \"repo.id\": 18201810, '\n", + " '\"public\": true, \"created_at\": \"2015-11-23T19:16:36Z\", '\n", + " '\"org.id\": 6233994, \"org.login\": \"18F\", '\n", + " '\"pull_request.id\": 51566831, \"pull_request.number\": '\n", + " '820, \"pull_request.state\": \"open\", '\n", + " '\"pull_request.title\": \"rename elk services to '\n", + " 'workaround blue-green deploy bug\", '\n", + " '\"pull_request.body\": \"there\\'s a bug in the '\n", + " 'cf-blue-green deploy that gets a false positive match '\n", + " 'based on the current ELK naming convention. I have '\n", + " 're-named all our ELK services to workaround that '\n", + " 'bug.\", \"pull_request.user.login\": \"pkarman\", '\n", + " '\"pull_request.user.id\": 1205061, '\n", + " '\"pull_request.author_association\": null, '\n", + " '\"pull_request.created_at\": \"2015-11-23T19:16:34Z\", '\n", + " '\"pull_request.updated_at\": \"2015-11-23T19:16:34Z\", '\n", + " '\"pull_request.closed_at\": null, '\n", + " '\"pull_request.merged_at\": null, '\n", + " '\"pull_request.merge_commit_sha\": '\n", + " '\"4b1557970247cde19eb3ea3992c324174d49a3d7\", '\n", + " '\"pull_request.locked\": false, '\n", + " '\"pull_request.assignee.login\": null, '\n", + " '\"pull_request.assignee.id\": null, '\n", + " '\"pull_request.assignee.type\": null, '\n", + " '\"pull_request.assignee.site_admin\": null, '\n", + " '\"pull_request.milestone.id\": null, '\n", + " '\"pull_request.milestone.number\": null, '\n", + " '\"pull_request.milestone.title\": null, '\n", + " '\"pull_request.milestone.description\": null, '\n", + " '\"pull_request.milestone.creator.login\": null, '\n", + " '\"pull_request.milestone.creator.id\": null, '\n", + " '\"pull_request.milestone.creator.type\": null, '\n", + " 
'\"pull_request.milestone.creator.site_admin\": null, '\n", + " '\"pull_request.milestone.open_issues\": null, '\n", + " '\"pull_request.milestone.closed_issues\": null, '\n", + " '\"pull_request.milestone.state\": null, '\n", + " '\"pull_request.milestone.created_at\": null, '\n", + " '\"pull_request.milestone.updated_at\": null, '\n", + " '\"pull_request.milestone.due_on\": null, '\n", + " '\"pull_request.milestone.closed_at\": null, '\n", + " '\"pull_request.merged\": false, '\n", + " '\"pull_request.mergeable\": true, '\n", + " '\"pull_request.mergeable_state\": \"clean\", '\n", + " '\"pull_request.merged_by.login\": null, '\n", + " '\"pull_request.merged_by.id\": null, '\n", + " '\"pull_request.merged_by.type\": null, '\n", + " '\"pull_request.merged_by.site_admin\": null, '\n", + " '\"pull_request.comments\": 0, '\n", + " '\"pull_request.review_comments\": 0, '\n", + " '\"pull_request.commits\": 1, \"pull_request.additions\": '\n", + " '3, \"pull_request.deletions\": 3, '\n", + " '\"pull_request.changed_files\": 1, '\n", + " '\"pull_request.label.id\": null, '\n", + " '\"pull_request.label.name\": null, '\n", + " '\"pull_request.label.color\": null, '\n", + " '\"pull_request.label.default\": null, '\n", + " '\"pull_request.head.label\": \"18F:elk-rename\", '\n", + " '\"pull_request.head.ref\": \"elk-rename\", '\n", + " '\"pull_request.head.sha\": '\n", + " '\"8a8321be4e8eff669e3d3406393b875bf56684c3\", '\n", + " '\"pull_request.head.user.login\": \"18F\", '\n", + " '\"pull_request.head.user.type\": \"Organization\", '\n", + " '\"pull_request.head.repo.name\": \"C2\", '\n", + " '\"pull_request.head.repo.full_name\": \"18F/C2\", '\n", + " '\"pull_request.head.repo.owner.login\": \"18F\", '\n", + " '\"pull_request.head.repo.owner.type\": \"Organization\", '\n", + " '\"pull_request.head.repo.private\": false, '\n", + " '\"pull_request.head.repo.homepage\": '\n", + " '\"https://cap.18f.gov\", '\n", + " '\"pull_request.head.repo.description\": \"an approval '\n", + 
" 'process automation tool\", '\n", + " '\"pull_request.head.repo.fork\": false, '\n", + " '\"pull_request.head.repo.created_at\": '\n", + " '\"2014-03-28T05:15:23Z\", '\n", + " '\"pull_request.head.repo.updated_at\": '\n", + " '\"2015-11-06T02:16:44Z\", '\n", + " '\"pull_request.head.repo.pushed_at\": '\n", + " '\"2015-11-23T19:16:35Z\", '\n", + " '\"pull_request.head.repo.size\": 81432, '\n", + " '\"pull_request.head.repo.stargazers_count\": 31, '\n", + " '\"pull_request.head.repo.watchers_count\": 31, '\n", + " '\"pull_request.head.repo.language\": \"Ruby\", '\n", + " '\"pull_request.head.repo.has_issues\": true, '\n", + " '\"pull_request.head.repo.has_projects\": null, '\n", + " '\"pull_request.head.repo.has_downloads\": true, '\n", + " '\"pull_request.head.repo.has_wiki\": false, '\n", + " '\"pull_request.head.repo.has_pages\": false, '\n", + " '\"pull_request.head.repo.forks_count\": 16, '\n", + " '\"pull_request.head.repo.archived\": null, '\n", + " '\"pull_request.head.repo.disabled\": null, '\n", + " '\"pull_request.head.repo.open_issues_count\": 6, '\n", + " '\"pull_request.head.repo.forks\": 16, '\n", + " '\"pull_request.head.repo.open_issues\": 6, '\n", + " '\"pull_request.head.repo.watchers\": 31, '\n", + " '\"pull_request.head.repo.default_branch\": \"master\", '\n", + " '\"pull_request.head.repo.license.key\": null, '\n", + " '\"pull_request.head.repo.license.spdx_id\": null, '\n", + " '\"pull_request.head.repo.license.name\": null, '\n", + " '\"pull_request.base.label\": \"18F:master\", '\n", + " '\"pull_request.base.ref\": \"master\", '\n", + " '\"pull_request.base.sha\": '\n", + " '\"5dc2669048311777bf472e824c1a6f865eaccc67\", '\n", + " '\"pull_request.base.user.login\": \"18F\", '\n", + " '\"pull_request.base.user.type\": \"Organization\", '\n", + " '\"pull_request.base.repo.name\": \"C2\", '\n", + " '\"pull_request.base.repo.full_name\": \"18F/C2\", '\n", + " '\"pull_request.base.repo.owner.login\": \"18F\", '\n", + " 
'\"pull_request.base.repo.owner.type\": \"Organization\", '\n", + " '\"pull_request.base.repo.private\": false, '\n", + " '\"pull_request.base.repo.homepage\": '\n", + " '\"https://cap.18f.gov\", '\n", + " '\"pull_request.base.repo.description\": \"an approval '\n", + " 'process automation tool\", '\n", + " '\"pull_request.base.repo.fork\": false, '\n", + " '\"pull_request.base.repo.created_at\": '\n", + " '\"2014-03-28T05:15:23Z\", '\n", + " '\"pull_request.base.repo.updated_at\": '\n", + " '\"2015-11-06T02:16:44Z\", '\n", + " '\"pull_request.base.repo.pushed_at\": '\n", + " '\"2015-11-23T19:16:35Z\", '\n", + " '\"pull_request.base.repo.size\": 81432, '\n", + " '\"pull_request.base.repo.stargazers_count\": 31, '\n", + " '\"pull_request.base.repo.watchers_count\": 31, '\n", + " '\"pull_request.base.repo.language\": \"Ruby\", '\n", + " '\"pull_request.base.repo.has_issues\": true, '\n", + " '\"pull_request.base.repo.has_projects\": null, '\n", + " '\"pull_request.base.repo.has_downloads\": true, '\n", + " '\"pull_request.base.repo.has_wiki\": false, '\n", + " '\"pull_request.base.repo.has_pages\": false, '\n", + " '\"pull_request.base.repo.forks_count\": 16, '\n", + " '\"pull_request.base.repo.archived\": null, '\n", + " '\"pull_request.base.repo.disabled\": null, '\n", + " '\"pull_request.base.repo.open_issues_count\": 6, '\n", + " '\"pull_request.base.repo.forks\": 16, '\n", + " '\"pull_request.base.repo.open_issues\": 6, '\n", + " '\"pull_request.base.repo.watchers\": 31, '\n", + " '\"pull_request.base.repo.default_branch\": \"master\", '\n", + " '\"pull_request.base.repo.license.key\": null, '\n", + " '\"pull_request.base.repo.license.spdx_id\": null, '\n", + " '\"pull_request.base.repo.license.name\": null, '\n", + " '\"pull_request.guid\": \"18F/C2/pull/820\"}, {\"type\": '\n", + " '\"PullRequestEvent\", \"action\": \"closed\", '\n", + " '\"actor.login\": \"jessieay\", \"actor.id\": 601515, '\n", + " '\"user.login\": null, \"user.id\": null, \"user.type\": 
'\n", + " 'null, \"repo.name\": \"18F/C2\", \"repo.id\": 18201810, '\n", + " '\"public\": true, \"created_at\": \"2015-11-23T22:09:46Z\", '\n", + " '\"org.id\": 6233994, \"org.login\": \"18F\", '\n", + " '\"pull_request.id\": 51566831, \"pull_request.number\": '\n", + " '820, \"pull_request.state\": \"closed\", '\n", + " '\"pull_request.title\": \"rename elk services to '\n", + " 'workaround blue-green deploy bug\", '\n", + " '\"pull_request.body\": \"there\\'s a bug in the '\n", + " 'cf-blue-green deploy that gets a false positive match '\n", + " 'based on the current ELK naming convention. I have '\n", + " 're-named all our ELK services to workaround that '\n", + " 'bug.\", \"pull_request.user.login\": \"pkarman\", '\n", + " '\"pull_request.user.id\": 1205061, '\n", + " '\"pull_request.author_association\": null, '\n", + " '\"pull_request.created_at\": \"2015-11-23T19:16:34Z\", '\n", + " '\"pull_request.updated_at\": \"2015-11-23T22:09:45Z\", '\n", + " '\"pull_request.closed_at\": \"2015-11-23T22:09:45Z\", '\n", + " '\"pull_request.merged_at\": \"2015-11-23T22:09:45Z\", '\n", + " '\"pull_request.merge_commit_sha\": '\n", + " '\"6d3c30d429a49321552973b81e1ef4cd3073157f\", '\n", + " '\"pull_request.locked\": false, '\n", + " '\"pull_request.assignee.login\": null, '\n", + " '\"pull_request.assignee.id\": null, '\n", + " '\"pull_request.assignee.type\": null, '\n", + " '\"pull_request.assignee.site_admin\": null, '\n", + " '\"pull_request.milestone.id\": null, '\n", + " '\"pull_request.milestone.number\": null, '\n", + " '\"pull_request.milestone.title\": null, '\n", + " '\"pull_request.milestone.description\": null, '\n", + " '\"pull_request.milestone.creator.login\": null, '\n", + " '\"pull_request.milestone.creator.id\": null, '\n", + " '\"pull_request.milestone.creator.type\": null, '\n", + " '\"pull_request.milestone.creator.site_admin\": null, '\n", + " '\"pull_request.milestone.open_issues\": null, '\n", + " '\"pull_request.milestone.closed_issues\": null, 
'\n", + " '\"pull_request.milestone.state\": null, '\n", + " '\"pull_request.milestone.created_at\": null, '\n", + " '\"pull_request.milestone.updated_at\": null, '\n", + " '\"pull_request.milestone.due_on\": null, '\n", + " '\"pull_request.milestone.closed_at\": null, '\n", + " '\"pull_request.merged\": true, '\n", + " '\"pull_request.mergeable\": null, '\n", + " '\"pull_request.mergeable_state\": \"unknown\", '\n", + " '\"pull_request.merged_by.login\": \"jessieay\", '\n", + " '\"pull_request.merged_by.id\": 601515, '\n", + " '\"pull_request.merged_by.type\": \"User\", '\n", + " '\"pull_request.merged_by.site_admin\": false, '\n", + " '\"pull_request.comments\": 1, '\n", + " '\"pull_request.review_comments\": 0, '\n", + " '\"pull_request.commits\": 1, \"pull_request.additions\": '\n", + " '3, \"pull_request.deletions\": 3, '\n", + " '\"pull_request.changed_files\": 1, '\n", + " '\"pull_request.label.id\": null, '\n", + " '\"pull_request.label.name\": null, '\n", + " '\"pull_request.label.color\": null, '\n", + " '\"pull_request.label.default\": null, '\n", + " '\"pull_request.head.label\": \"18F:elk-rename\", '\n", + " '\"pull_request.head.ref\": \"elk-rename\", '\n", + " '\"pull_request.head.sha\": '\n", + " '\"8a8321be4e8eff669e3d3406393b875bf56684c3\", '\n", + " '\"pull_request.head.user.login\": \"18F\", '\n", + " '\"pull_request.head.user.type\": \"Organization\", '\n", + " '\"pull_request.head.repo.name\": \"C2\", '\n", + " '\"pull_request.head.repo.full_name\": \"18F/C2\", '\n", + " '\"pull_request.head.repo.owner.login\": \"18F\", '\n", + " '\"pull_request.head.repo.owner.type\": \"Organization\", '\n", + " '\"pull_request.head.repo.private\": false, '\n", + " '\"pull_request.head.repo.homepage\": '\n", + " '\"https://cap.18f.gov\", '\n", + " '\"pull_request.head.repo.description\": \"an approval '\n", + " 'process automation tool\", '\n", + " '\"pull_request.head.repo.fork\": false, '\n", + " '\"pull_request.head.repo.created_at\": '\n", + " 
'\"2014-03-28T05:15:23Z\", '\n", + " '\"pull_request.head.repo.updated_at\": '\n", + " '\"2015-11-06T02:16:44Z\", '\n", + " '\"pull_request.head.repo.pushed_at\": '\n", + " '\"2015-11-23T22:09:45Z\", '\n", + " '\"pull_request.head.repo.size\": 81440, '\n", + " '\"pull_request.head.repo.stargazers_count\": 31, '\n", + " '\"pull_request.head.repo.watchers_count\": 31, '\n", + " '\"pull_request.head.repo.language\": \"Ruby\", '\n", + " '\"pull_request.head.repo.has_issues\": true, '\n", + " '\"pull_request.head.repo.has_projects\": null, '\n", + " '\"pull_request.head.repo.has_downloads\": true, '\n", + " '\"pull_request.head.repo.has_wiki\": false, '\n", + " '\"pull_request.head.repo.has_pages\": false, '\n", + " '\"pull_request.head.repo.forks_count\": 16, '\n", + " '\"pull_request.head.repo.archived\": null, '\n", + " '\"pull_request.head.repo.disabled\": null, '\n", + " '\"pull_request.head.repo.open_issues_count\": 4, '\n", + " '\"pull_request.head.repo.forks\": 16, '\n", + " '\"pull_request.head.repo.open_issues\": 4, '\n", + " '\"pull_request.head.repo.watchers\": 31, '\n", + " '\"pull_request.head.repo.default_branch\": \"master\", '\n", + " '\"pull_request.head.repo.license.key\": null, '\n", + " '\"pull_request.head.repo.license.spdx_id\": null, '\n", + " '\"pull_request.head.repo.license.name\": null, '\n", + " '\"pull_request.base.label\": \"18F:master\", '\n", + " '\"pull_request.base.ref\": \"master\", '\n", + " '\"pull_request.base.sha\": '\n", + " '\"5dc2669048311777bf472e824c1a6f865eaccc67\", '\n", + " '\"pull_request.base.user.login\": \"18F\", '\n", + " '\"pull_request.base.user.type\": \"Organization\", '\n", + " '\"pull_request.base.repo.name\": \"C2\", '\n", + " '\"pull_request.base.repo.full_name\": \"18F/C2\", '\n", + " '\"pull_request.base.repo.owner.login\": \"18F\", '\n", + " '\"pull_request.base.repo.owner.type\": \"Organization\", '\n", + " '\"pull_request.base.repo.private\": false, '\n", + " '\"pull_request.base.repo.homepage\": '\n", + 
" '\"https://cap.18f.gov\", '\n", + " '\"pull_request.base.repo.description\": \"an approval '\n", + " 'process automation tool\", '\n", + " '\"pull_request.base.repo.fork\": false, '\n", + " '\"pull_request.base.repo.created_at\": '\n", + " '\"2014-03-28T05:15:23Z\", '\n", + " '\"pull_request.base.repo.updated_at\": '\n", + " '\"2015-11-06T02:16:44Z\", '\n", + " '\"pull_request.base.repo.pushed_at\": '\n", + " '\"2015-11-23T22:09:45Z\", '\n", + " '\"pull_request.base.repo.size\": 81440, '\n", + " '\"pull_request.base.repo.stargazers_count\": 31, '\n", + " '\"pull_request.base.repo.watchers_count\": 31, '\n", + " '\"pull_request.base.repo.language\": \"Ruby\", '\n", + " '\"pull_request.base.repo.has_issues\": true, '\n", + " '\"pull_request.base.repo.has_projects\": null, '\n", + " '\"pull_request.base.repo.has_downloads\": true, '\n", + " '\"pull_request.base.repo.has_wiki\": false, '\n", + " '\"pull_request.base.repo.has_pages\": false, '\n", + " '\"pull_request.base.repo.forks_count\": 16, '\n", + " '\"pull_request.base.repo.archived\": null, '\n", + " '\"pull_request.base.repo.disabled\": null, '\n", + " '\"pull_request.base.repo.open_issues_count\": 4, '\n", + " '\"pull_request.base.repo.forks\": 16, '\n", + " '\"pull_request.base.repo.open_issues\": 4, '\n", + " '\"pull_request.base.repo.watchers\": 31, '\n", + " '\"pull_request.base.repo.default_branch\": \"master\", '\n", + " '\"pull_request.base.repo.license.key\": null, '\n", + " '\"pull_request.base.repo.license.spdx_id\": null, '\n", + " '\"pull_request.base.repo.license.name\": null, '\n", + " '\"pull_request.guid\": \"18F/C2/pull/820\"}]',\n", + " 'pull_request.guid': '18F/C2/pull/820',\n", + " 'pull_request.issue_events': '{\"repo\": \"18F/C2\", \"org\": \"18F\", \"issue_id\": '\n", + " '118451607, \"issue_number\": 820, \"pull_request\": '\n", + " '{\"number\": 820.0, \"repo\": \"C2\", \"user_login\": '\n", + " '\"18F\"}, \"events\": [{\"action\": \"opened\", '\n", + " '\"author\": \"pkarman\", 
\"comment\": null, '\n", + " '\"comment_id\": null, \"datetime\": '\n", + " '\"2015-11-23T19:16:34Z\", \"description\": '\n", + " '\"there\\'s a bug in the cf-blue-green deploy '\n", + " 'that gets a false positive match based on the '\n", + " 'current ELK naming convention. I have re-named '\n", + " 'all our ELK services to workaround that bug.\", '\n", + " '\"title\": \"rename elk services to workaround '\n", + " 'blue-green deploy bug\", \"type\": \"issue\"}, '\n", + " '{\"action\": \"created\", \"author\": \"jessieay\", '\n", + " '\"comment\": \"wish there were a good way to write '\n", + " 'tests for this type of thing...\\\\r\\\\n\\\\r\\\\nbut '\n", + " 'LGTM. merging. \", \"comment_id\": 159082113.0, '\n", + " '\"datetime\": \"2015-11-23 22:09:43+00:00\", '\n", + " '\"description\": null, \"title\": null, \"type\": '\n", + " '\"comment\"}]}'}\n" + ] + } + ], + "source": [ + "pprint(small_ds[8])" + ] + }, + { + "cell_type": "code", + "execution_count": 327, + "metadata": {}, + "outputs": [], + "source": [ + "actions = []\n", + "c = 0\n", + "for events in ds[\"events\"]:\n", + " c += 1\n", + " actions.extend([event[\"action\"] for event in events])\n", + " if c > 10000:\n", + " break\n" + ] + }, + { + "cell_type": "code", + "execution_count": 328, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'closed', 'created', 'opened', 'reopened'}" + ] + }, + "execution_count": 328, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "set(actions)" + ] + }, + { + "cell_type": "code", + "execution_count": 322, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'base_repo_info': {'pull_request.base.label': '1011X:master',\n", + " 'pull_request.base.ref': 'master',\n", + " 'pull_request.base.repo.default_branch': 'master',\n", + " 'pull_request.base.repo.description': 'Representing '\n", + " 'rational numbers '\n", + " 'using the '\n", + " 'floating-bar number '\n", + " 
'type.',\n", + " 'pull_request.base.repo.forks_count': 2,\n", + " 'pull_request.base.repo.homepage': None,\n", + " 'pull_request.base.repo.language': 'Rust',\n", + " 'pull_request.base.repo.license.name': 'Other',\n", + " 'pull_request.base.repo.name': 'floating_bar',\n", + " 'pull_request.base.repo.open_issues_count': 6,\n", + " 'pull_request.base.repo.owner.login': '1011X',\n", + " 'pull_request.base.repo.owner.type': 'User',\n", + " 'pull_request.base.repo.private': False,\n", + " 'pull_request.base.repo.stargazers_count': 15,\n", + " 'pull_request.base.repo.watchers_count': 15,\n", + " 'pull_request.base.sha': '27ee250ef208e11aa36dc77022b0f8a58e965dba',\n", + " 'pull_request.base.user.login': '1011X',\n", + " 'pull_request.base.user.type': 'User',\n", + " 'pull_request.comments': 0,\n", + " 'pull_request.label.name': None,\n", + " 'pull_request.review_comments': 0},\n", + " 'bucket': '940',\n", + " 'events': [{'action': 'opened',\n", + " 'actor.id': None,\n", + " 'actor.login': None,\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': datetime.datetime(2021, 5, 8, 20, 30, 31, tzinfo=),\n", + " 'issue.author': 'ZoeyR',\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': None,\n", + " 'pull_request.merged': None,\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.state': None,\n", + " 
'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'issue',\n", + " 'user.login': None,\n", + " 'user.type': None},\n", + " {'action': 'opened',\n", + " 'actor.id': 8010244,\n", + " 'actor.login': 'ZoeyR',\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': datetime.datetime(2021, 5, 8, 20, 30, 32, tzinfo=),\n", + " 'issue.author': None,\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': None,\n", + " 'pull_request.merged': False,\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.state': 'open',\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'PullRequestEvent',\n", + " 'user.login': None,\n", + " 'user.type': None},\n", + " {'action': 'created',\n", + " 'actor.id': None,\n", + " 'actor.login': None,\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': 
None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': datetime.datetime(2021, 5, 8, 20, 38, 27, tzinfo=),\n", + " 'issue.author': '1011X',\n", + " 'issue.comment': 'LGTM, thank you!',\n", + " 'issue.comment_id': 835503633.0,\n", + " 'pull_request.merged': None,\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.state': None,\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'comment',\n", + " 'user.login': None,\n", + " 'user.type': None},\n", + " {'action': 'closed',\n", + " 'actor.id': 1851619,\n", + " 'actor.login': '1011X',\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': datetime.datetime(2021, 5, 8, 20, 38, 38, tzinfo=),\n", + " 'issue.author': None,\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': None,\n", + " 'pull_request.merged': True,\n", + " 
'pull_request.merged_by.login': '1011X',\n", + " 'pull_request.merged_by.type': 'User',\n", + " 'pull_request.state': 'closed',\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'PullRequestEvent',\n", + " 'user.login': None,\n", + " 'user.type': None}],\n", + " 'head_repo_info': {'pull_request.head.label': 'ZoeyR:fractional-benches',\n", + " 'pull_request.head.ref': 'fractional-benches',\n", + " 'pull_request.head.repo.default_branch': 'master',\n", + " 'pull_request.head.repo.description': 'Representing '\n", + " 'rational numbers '\n", + " 'using the '\n", + " 'floating-bar number '\n", + " 'type.',\n", + " 'pull_request.head.repo.homepage': None,\n", + " 'pull_request.head.repo.language': None,\n", + " 'pull_request.head.repo.license.name': 'Other',\n", + " 'pull_request.head.repo.name': 'floating_bar',\n", + " 'pull_request.head.repo.owner.login': 'ZoeyR',\n", + " 'pull_request.head.repo.owner.type': 'User',\n", + " 'pull_request.head.repo.private': False,\n", + " 'pull_request.head.repo.stargazers_count': 0,\n", + " 'pull_request.head.sha': '742df616b7ea2cb927d5247ec69b91e6c6d8cbdd',\n", + " 'pull_request.head.user.login': 'ZoeyR',\n", + " 'pull_request.head.user.type': 'User'},\n", + " 'pull_request_info': {'org.id': None,\n", + " 'public': True,\n", + " 'pull_request.additions': 23,\n", + " 'pull_request.base.user.type': 'User',\n", + " 'pull_request.body': '',\n", + " 'pull_request.changed_files': 4,\n", + " 'pull_request.closed_at': None,\n", + " 'pull_request.comments': 0,\n", + " 'pull_request.commits': 1,\n", + " 'pull_request.created_at': '2021-05-08T20:30:31Z',\n", + " 'pull_request.deletions': 19,\n", + " 'pull_request.guid': '1011X/floating_bar/pull/7',\n", + " 'pull_request.head.user.type': 'User',\n", + " 'pull_request.id': 634875503,\n", + " 'pull_request.merged_at': None,\n", + " 
'pull_request.merged_by.login': None,\n", + " 'pull_request.milestone.description': None,\n", + " 'pull_request.milestone.number': None,\n", + " 'pull_request.milestone.title': None,\n", + " 'pull_request.number': 7,\n", + " 'pull_request.review_comments': 0,\n", + " 'pull_request.state': 'open',\n", + " 'pull_request.title': 'change benches to use fractional '\n", + " 'values',\n", + " 'pull_request.user.id': 8010244,\n", + " 'pull_request.user.login': 'ZoeyR',\n", + " 'repo.id': 166723951,\n", + " 'repo.name': '1011X/floating_bar'}}\n" + ] + } + ], + "source": [ + "pprint(ds[6])" + ] + }, + { + "cell_type": "code", + "execution_count": 318, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'bucket': '940',\n", + " 'pull_request_info': {'org.id': None,\n", + " 'public': True,\n", + " 'pull_request.additions': 23,\n", + " 'pull_request.base.user.type': 'User',\n", + " 'pull_request.body': '',\n", + " 'pull_request.changed_files': 4,\n", + " 'pull_request.closed_at': None,\n", + " 'pull_request.comments': 0,\n", + " 'pull_request.commits': 1,\n", + " 'pull_request.created_at': '2021-05-08T20:30:31Z',\n", + " 'pull_request.deletions': 19,\n", + " 'pull_request.guid': '1011X/floating_bar/pull/7',\n", + " 'pull_request.head.user.type': 'User',\n", + " 'pull_request.id': 634875503,\n", + " 'pull_request.merged_at': None,\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.milestone.description': None,\n", + " 'pull_request.milestone.number': None,\n", + " 'pull_request.milestone.title': None,\n", + " 'pull_request.number': 7,\n", + " 'pull_request.review_comments': 0,\n", + " 'pull_request.state': 'open',\n", + " 'pull_request.title': 'change benches to use fractional values',\n", + " 'pull_request.user.id': 8010244,\n", + " 'pull_request.user.login': 'ZoeyR',\n", + " 'repo.id': 166723951,\n", + " 'repo.name': '1011X/floating_bar'},\n", + " 'head_repo_info': {'pull_request.head.label': 'ZoeyR:fractional-benches',\n", + " 
'pull_request.head.ref': 'fractional-benches',\n", + " 'pull_request.head.repo.default_branch': 'master',\n", + " 'pull_request.head.repo.description': 'Representing rational numbers using the floating-bar number type.',\n", + " 'pull_request.head.repo.homepage': None,\n", + " 'pull_request.head.repo.language': None,\n", + " 'pull_request.head.repo.license.name': 'Other',\n", + " 'pull_request.head.repo.name': 'floating_bar',\n", + " 'pull_request.head.repo.owner.login': 'ZoeyR',\n", + " 'pull_request.head.repo.owner.type': 'User',\n", + " 'pull_request.head.repo.private': False,\n", + " 'pull_request.head.repo.stargazers_count': 0,\n", + " 'pull_request.head.sha': '742df616b7ea2cb927d5247ec69b91e6c6d8cbdd',\n", + " 'pull_request.head.user.login': 'ZoeyR',\n", + " 'pull_request.head.user.type': 'User'},\n", + " 'base_repo_info': {'pull_request.base.label': '1011X:master',\n", + " 'pull_request.base.ref': 'master',\n", + " 'pull_request.base.repo.default_branch': 'master',\n", + " 'pull_request.base.repo.description': 'Representing rational numbers using the floating-bar number type.',\n", + " 'pull_request.base.repo.forks_count': 2,\n", + " 'pull_request.base.repo.homepage': None,\n", + " 'pull_request.base.repo.language': 'Rust',\n", + " 'pull_request.base.repo.license.name': 'Other',\n", + " 'pull_request.base.repo.name': 'floating_bar',\n", + " 'pull_request.base.repo.open_issues_count': 6,\n", + " 'pull_request.base.repo.owner.login': '1011X',\n", + " 'pull_request.base.repo.owner.type': 'User',\n", + " 'pull_request.base.repo.private': False,\n", + " 'pull_request.base.repo.stargazers_count': 15,\n", + " 'pull_request.base.repo.watchers_count': 15,\n", + " 'pull_request.base.sha': '27ee250ef208e11aa36dc77022b0f8a58e965dba',\n", + " 'pull_request.base.user.login': '1011X',\n", + " 'pull_request.base.user.type': 'User',\n", + " 'pull_request.comments': 0,\n", + " 'pull_request.label.name': None,\n", + " 'pull_request.review_comments': 0},\n", + " 'events': 
[{'action': 'opened',\n", + " 'actor.id': None,\n", + " 'actor.login': None,\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': datetime.datetime(2021, 5, 8, 20, 30, 31, tzinfo=),\n", + " 'issue.author': 'ZoeyR',\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': None,\n", + " 'pull_request.merged': None,\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.state': None,\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'issue',\n", + " 'user.login': None,\n", + " 'user.type': None},\n", + " {'action': 'opened',\n", + " 'actor.id': 8010244,\n", + " 'actor.login': 'ZoeyR',\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 
'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': datetime.datetime(2021, 5, 8, 20, 30, 32, tzinfo=),\n", + " 'issue.author': None,\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': None,\n", + " 'pull_request.merged': False,\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.state': 'open',\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'PullRequestEvent',\n", + " 'user.login': None,\n", + " 'user.type': None},\n", + " {'action': 'created',\n", + " 'actor.id': None,\n", + " 'actor.login': None,\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': datetime.datetime(2021, 5, 8, 20, 38, 27, tzinfo=),\n", + " 'issue.author': '1011X',\n", + " 'issue.comment': 'LGTM, thank you!',\n", + " 'issue.comment_id': 835503633.0,\n", + " 'pull_request.merged': None,\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.state': None,\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 
'review.submitted_at': None,\n", + " 'type': 'comment',\n", + " 'user.login': None,\n", + " 'user.type': None},\n", + " {'action': 'closed',\n", + " 'actor.id': 1851619,\n", + " 'actor.login': '1011X',\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': datetime.datetime(2021, 5, 8, 20, 38, 38, tzinfo=),\n", + " 'issue.author': None,\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': None,\n", + " 'pull_request.merged': True,\n", + " 'pull_request.merged_by.login': '1011X',\n", + " 'pull_request.merged_by.type': 'User',\n", + " 'pull_request.state': 'closed',\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'PullRequestEvent',\n", + " 'user.login': None,\n", + " 'user.type': None}]}" + ] + }, + "execution_count": 318, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sample = ds[6]\n", + "sample" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [], + "source": [ + "sample = ds[0]\n", + "pr_info = sample[\"pull_request_info\"]\n", + "head_info = sample[\"head_repo_info\"]\n", + "base_info = sample[\"base_repo_info\"]\n", + "events = sample[\"events\"]\n", + "\n", + "gh_link = 
f\"https://github.com/{pr_info['repo.name']}/pull/{pr_info['pull_request.number']}\"\n", + "\n", + "header = f\"\"\"📝 **Title**: {pr_info['pull_request.title']}
\n", + "📦 **GitHub Repo**: {pr_info['repo.name']}, PR Number: {pr_info['pull_request.number']}, ID: {pr_info['pull_request.id']}.
\n", + "Link: [{gh_link}]({gh_link})\"\"\"\n", + "pr_info_html = f\"\"\"\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AttributeDetail
🧾 PR Type{events[0]['type']}
🟢 PR State{pr_info['pull_request.state']}
👤 PR Author{pr_info['pull_request.user.login']}
🏷️ Head Branchref: {head_info['pull_request.head.ref']}, label: {head_info['pull_request.head.label']}
🌳 Base Branch{base_info['pull_request.base.ref']}
\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AttributeDetail
🧾 PR TypePullRequestEvent
🟢 PR Stateopen
👤 PR Authordependabot[bot]
🏷️ Head Branchref: dependabot/npm_and_yarn/qs-6.5.3, label: AbdElrahmanMuhammedNasr:dependabot/npm_and_yarn/qs-6.5.3
🌳 Base Branchmaster
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# display pr_info_html as HTML\n", + "from IPython.display import HTML, display\n", + "display(HTML(pr_info_html))" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
actioncommentscreated_attype
0opened{'actor.id': 49699333, 'actor.login': 'dependa...2022-12-10 03:27:08+00:00PullRequestEvent
\n", + "
" + ], + "text/plain": [ + " action comments \n", + "0 opened {'actor.id': 49699333, 'actor.login': 'dependa... \\\n", + "\n", + " created_at type \n", + "0 2022-12-10 03:27:08+00:00 PullRequestEvent " + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame(events)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Event TypePullRequestEvent
UserNone (type :None)
Review StateNone
From HeadAbdElrahmanMuhammedNasr:dependabot/npm_and_yarn/qs-6.5.3
\n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def create_grouped_events(events):\n", + " df = pd.DataFrame(events)\n", + " df['created_at'] = pd.to_datetime(df['created_at'])\n", + " df = df.sort_values(['comment.diff_hunk', 'comment.commit_id', 'created_at'])\n", + " # Group events in a the same thread using 'comment.diff_hunk' and 'comment.commit_id'\n", + " if len(df) == 1:\n", + " grouped_events = [[df.iloc[0].to_dict()]]\n", + " else:\n", + " grouped_events = [group.to_dict(orient='records') for _, group in df.groupby(['comment.diff_hunk', 'comment.commit_id'])]\n", + " return grouped_events\n", + " \n", + "events = sample[\"events\"]\n", + "grouped_events = create_grouped_events(events)\n", + "original_poster = sample[\"pull_request_info\"]['pull_request.user.login']\n", + "for thread in grouped_events:\n", + " # Start a new thread\n", + " thread_html = '
'\n", + " # Get the first event in the thread as a reference\n", + " first_event = thread[0]\n", + " \n", + " # Add shared parts of the events only once\n", + " text = f\"\"\"\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Event Type{first_event['type']}
User{first_event['user.login']} (type :{first_event['user.type']})
Review State{first_event['review.state']}
From Head{head_info['pull_request.head.label']}
\n", + "
\n", + " \"\"\"\n", + " thread_html += text\n", + " \n", + " # Add the bodies of the comments for each event in the thread\n", + " for event in thread:\n", + " if event['comment.body']:\n", + " is_op = original_poster == event['user.login']\n", + " thread_html += format_body(event['comment.body'], event['user.login'], is_op)\n", + " thread_html += '
'\n", + " display(HTML(thread_html))\n", + " if first_event['comment.path']:\n", + " path_html = f\"Path: {first_event['comment.path']}\"\n", + " display(HTML(path_html))\n", + " if first_event[\"comment.diff_hunk\"]:\n", + " print(first_event[\"comment.diff_hunk\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'dependabot[bot]'" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sample[\"pull_request_info\"]['pull_request.user.login']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "import ghdiff\n", + "import streamlit as st\n", + "import streamlit.components.v1 as components\n", + "from datasets import load_dataset\n", + "\n", + "\n", + "# save dataset as in \"bigcode/code_reviews_sample\"\n", + "ds = load_dataset(\"loubnabnl/clean_prs2\", split=\"train\")\n", + "size = len(ds)\n", + "\n", + "def show_diff_hunk(diff_hunk, position, context=5):\n", + " # exclude the first line with the @@ notation\n", + " lines = diff_hunk.split('\\n')\n", + " start_line = max(int(position) - context - 1, 0)\n", + " end_line = int(position)\n", + " actual_diff = lines[0] + '\\n' + '\\n'.join(lines[start_line + 1:end_line + 1])\n", + " focus = ghdiff.colorize(actual_diff)\n", + " full = ghdiff.colorize(diff_hunk)\n", + " # Wrap the diff hunk inside a scrollable div\n", + " scrollable_focus = f'
{focus}
'\n", + " scrollable_full = f'
{full}
'\n", + " if len(lines) <= 12:\n", + " return None, scrollable_full\n", + " return scrollable_focus, scrollable_full\n", + "\n", + "\n", + "def format_body(text, user, is_op=False):\n", + " color = \"#007bff\" if is_op else \"black\"\n", + " pr_body = f\"
👤{user}: {text}
\"\n", + " return pr_body\n", + "\n", + "\n", + "def create_grouped_events(events):\n", + " df = pd.DataFrame(events)\n", + " df['created_at'] = pd.to_datetime(df['created_at'])\n", + " df = df.sort_values(['comment.diff_hunk', 'comment.commit_id', 'created_at'])\n", + " # Group events in a the same thread using 'comment.diff_hunk' and 'comment.commit_id'\n", + " if len(df) == 1:\n", + " grouped_events = [[df.iloc[0].to_dict()]]\n", + " else:\n", + " grouped_events = [group.to_dict(orient='records') for _, group in df.groupby(['comment.diff_hunk', 'comment.commit_id'])]\n", + " return grouped_events\n", + "\n", + "\n", + "def get_pr_info(sample):\n", + " pr_info = sample[\"pull_request_info\"]\n", + " head_info = sample[\"head_repo_info\"]\n", + " base_info = sample[\"base_repo_info\"]\n", + " events = sample[\"events\"]\n", + "\n", + " gh_link = f\"https://github.com/{pr_info['repo.name']}/pull/{pr_info['pull_request.number']}\"\n", + " \n", + " header = f\"\"\"📝 **Title**: {pr_info['pull_request.title']}
\n", + " 📦 **GitHub Repo**: {pr_info['repo.name']}, PR Number: {pr_info['pull_request.number']}, ID: {pr_info['pull_request.id']}.
\n", + " Link: [{gh_link}]({gh_link})\"\"\"\n", + " pr_info_html = f\"\"\"\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AttributeDetail
🧾 PR Type{events[0]['type']}
🟢 PR State{pr_info['pull_request.state']}
👤 PR Author{pr_info['pull_request.user.login']}
🏷️ Head Branchref: {head_info['pull_request.head.ref']}, label: {head_info['pull_request.head.label']}
🌳 Base Branch{base_info['pull_request.base.ref']}
\n", + " \"\"\"\n", + " return header, pr_info_html\n", + "\n", + "\n", + "def display_events(sample):\n", + " events = sample[\"events\"]\n", + " grouped_events = create_grouped_events(events)\n", + " original_poster = sample[\"pull_request_info\"]['pull_request.user.login']\n", + " for thread in grouped_events:\n", + " # Start a new thread\n", + " thread_html = '
'\n", + " # Get the first event in the thread as a reference\n", + " first_event = thread[0]\n", + " \n", + " # Add shared parts of the events only once\n", + " text = f\"\"\"\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Event Type{first_event['type']}
User{first_event['user.login']} (type :{first_event['user.type']})
Review State{first_event['review.state']}
From Head{first_event['pull_request.head.label']}
\n", + "
\n", + " \"\"\"\n", + " thread_html += text\n", + " \n", + " # Add the bodies of the comments for each event in the thread\n", + " for event in thread:\n", + " if event['comment.body']:\n", + " is_op = original_poster == event['user.login']\n", + " thread_html += format_body(event['comment.body'], event['user.login'], is_op)\n", + " thread_html += '
'\n", + " st.markdown(thread_html, unsafe_allow_html=True)\n", + " if first_event['comment.path']:\n", + " path_html = f\"Path: {first_event['comment.path']}\"\n", + " st.markdown(path_html, unsafe_allow_html=True)\n", + " if first_event[\"comment.diff_hunk\"]:\n", + " focus_diff, full_diff = show_diff_hunk(first_event[\"comment.diff_hunk\"], first_event[\"comment.original_position\"])\n", + " if not focus_diff:\n", + " components.html(full_diff)\n", + " else:\n", + " components.html(focus_diff)\n", + " with st.expander(\"View Full diff hunk\"):\n", + " components.html(full_diff)\n", + " st.markdown(\"---\")\n", + "\n", + "def custom_css():\n", + " st.markdown(\"\"\"\n", + " \n", + " \"\"\", unsafe_allow_html=True)\n", + "\n", + "custom_css()\n", + "\n", + "\n", + "#st.set_page_config(page_icon=\":laptop:\", layout=\"wide\")\n", + "st.markdown(f\"\"\"\\\n", + " # GitHub Code Reviews Inspection 🔍\n", + " In this space you can inspect code reviews from GitHUb Pull Requests. Note that some may have empty text (e.g approval of a PR without a code comment).\n", + " You can find the dataset at [bigcode/code_reviews_sample](https://huggingface.co/datasets/bigcode/code_reviews_sample)\n", + " \"\"\"\n", + " )\n", + "example_index = st.number_input(f\"Example (0 to {size-1}):\", min_value=0, max_value=size-1, value=0, step=1)\n", + "\n", + "header, pr_info_html = get_pr_info(ds[example_index])\n", + "st.subheader(\"PR information\")\n", + "st.markdown(header, unsafe_allow_html=True)\n", + "st.markdown(pr_info_html, unsafe_allow_html=True)\n", + "st.markdown(\"
\", unsafe_allow_html=True)\n", + "st.subheader(\"Code review events\")\n", + "event_blocks = display_events(ds[example_index])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ValueError: The features can't be aligned because the key pull_request_info of features {'pull_request.guid': Value(dtype='string', id=None), 'pull_request.code_review_events': Value(dtype='string', id=None), 'pull_request.events': Value(dtype='string', id=None), 'pull_request.issue_events': Value(dtype='string', id=None), 'bucket': Value(dtype='string', id=None), '__index_level_0__': Value(dtype='int64', id=None), 'pull_request_info': {'org.id': Value(dtype='int64', id=None), 'public': Value(dtype='bool', id=None), 'pull_request.additions': Value(dtype='int64', id=None), 'pull_request.body': Value(dtype='string', id=None), 'pull_request.changed_files': Value(dtype='int64', id=None), 'pull_request.closed_at': Value(dtype='null', id=None), 'pull_request.comments': Value(dtype='int64', id=None), 'pull_request.commits': Value(dtype='int64', id=None), 'pull_request.created_at': Value(dtype='string', id=None), 'pull_request.deletions': Value(dtype='int64', id=None), 'pull_request.guid': Value(dtype='string', id=None), 'pull_request.id': Value(dtype='int64', id=None), 'pull_request.merged_at': Value(dtype='null', id=None), 'pull_request.merged_by.login': Value(dtype='null', id=None), 'pull_request.milestone.description': Value(dtype='null', id=None), 'pull_request.milestone.number': Value(dtype='null', id=None), 'pull_request.milestone.title': Value(dtype='null', id=None), 'pull_request.number': Value(dtype='int64', id=None), 'pull_request.review_comments': Value(dtype='int64', id=None), 'pull_request.state': Value(dtype='string', id=None), 'pull_request.title': Value(dtype='string', id=None), 'pull_request.user.id': Value(dtype='int64', id=None), 'pull_request.user.login': Value(dtype='string', id=None), 'repo.id': 
Value(dtype='int64', id=None), 'repo.name': Value(dtype='string', id=None)}, 'head_repo_info': {'pull_request.head.label': Value(dtype='string', id=None), 'pull_request.head.ref': Value(dtype='string', id=None), 'pull_request.head.repo.default_branch': Value(dtype='string', id=None), 'pull_request.head.repo.description': Value(dtype='null', id=None), 'pull_request.head.repo.homepage': Value(dtype='null', id=None), 'pull_request.head.repo.language': Value(dtype='string', id=None), 'pull_request.head.repo.license.name': Value(dtype='null', id=None), 'pull_request.head.repo.name': Value(dtype='string', id=None), 'pull_request.head.repo.owner.login': Value(dtype='string', id=None), 'pull_request.head.repo.owner.type': Value(dtype='string', id=None), 'pull_request.head.repo.private': Value(dtype='bool', id=None), 'pull_request.head.repo.stargazers_count': Value(dtype='int64', id=None), 'pull_request.head.sha': Value(dtype='string', id=None), 'pull_request.head.user.login': Value(dtype='string', id=None), 'pull_request.head.user.type': Value(dtype='string', id=None)}, 'base_repo_info': {'pull_request.base.label': Value(dtype='string', id=None), 'pull_request.base.ref': Value(dtype='string', id=None), 'pull_request.base.repo.default_branch': Value(dtype='string', id=None), 'pull_request.base.repo.description': Value(dtype='null', id=None), 'pull_request.base.repo.forks_count': Value(dtype='int64', id=None), 'pull_request.base.repo.homepage': Value(dtype='null', id=None), 'pull_request.base.repo.language': Value(dtype='string', id=None), 'pull_request.base.repo.license.name': Value(dtype='null', id=None), 'pull_request.base.repo.name': Value(dtype='string', id=None), 'pull_request.base.repo.open_issues_count': Value(dtype='int64', id=None), 'pull_request.base.repo.owner.login': Value(dtype='string', id=None), 'pull_request.base.repo.owner.type': Value(dtype='string', id=None), 'pull_request.base.repo.private': Value(dtype='bool', id=None), 
'pull_request.base.repo.stargazers_count': Value(dtype='int64', id=None), 'pull_request.base.repo.watchers_count': Value(dtype='int64', id=None), 'pull_request.base.sha': Value(dtype='string', id=None), 'pull_request.base.user.login': Value(dtype='string', id=None), 'pull_request.base.user.type': Value(dtype='string', id=None), 'pull_request.comments': Value(dtype='int64', id=None), 'pull_request.label.name': Value(dtype='null', id=None), 'pull_request.review_comments': Value(dtype='int64', id=None)}, 'events': [{'action': Value(dtype='string', id=None), 'created_at': Value(dtype='timestamp[us, tz=UTC]', id=None), 'issues_comments': {'action': Value(dtype='string', id=None), 'author': Value(dtype='null', id=None), 'comment': Value(dtype='null', id=None), 'comment_id': Value(dtype='null', id=None), 'datetime': Value(dtype='null', id=None), 'type': Value(dtype='string', id=None)}, 'review_comments': {'actor.id': Value(dtype='int64', id=None), 'actor.login': Value(dtype='string', id=None), 'comment.author_association': Value(dtype='null', id=None), 'comment.body': Value(dtype='null', id=None), 'comment.commit_id': Value(dtype='null', id=None), 'comment.created_at': Value(dtype='null', id=None), 'comment.diff_hunk': Value(dtype='null', id=None), 'comment.id': Value(dtype='null', id=None), 'comment.in_reply_to_id': Value(dtype='null', id=None), 'comment.line': Value(dtype='null', id=None), 'comment.original_commit_id': Value(dtype='null', id=None), 'comment.original_line': Value(dtype='null', id=None), 'comment.original_position': Value(dtype='null', id=None), 'comment.original_start_line': Value(dtype='null', id=None), 'comment.path': Value(dtype='null', id=None), 'comment.position': Value(dtype='null', id=None), 'comment.side': Value(dtype='null', id=None), 'comment.start_line': Value(dtype='null', id=None), 'comment.start_side': Value(dtype='null', id=None), 'comment.updated_at': Value(dtype='null', id=None), 'review.author_association': Value(dtype='null', id=None), 
'review.body': Value(dtype='null', id=None), 'review.commit_id': Value(dtype='null', id=None), 'review.id': Value(dtype='null', id=None), 'review.state': Value(dtype='null', id=None), 'review.submitted_at': Value(dtype='null', id=None), 'user.login': Value(dtype='null', id=None), 'user.type': Value(dtype='null', id=None)}, 'type': Value(dtype='string', id=None)}]} has unexpected type - {'org.id': Value(dtype='int64', id=None), 'public': Value(dtype='bool', id=None), 'pull_request.additions': Value(dtype='int64', id=None), 'pull_request.body': Value(dtype='string', id=None), 'pull_request.changed_files': Value(dtype='int64', id=None), 'pull_request.closed_at': Value(dtype='null', id=None), 'pull_request.comments': Value(dtype='int64', id=None), 'pull_request.commits': Value(dtype='int64', id=None), 'pull_request.created_at': Value(dtype='string', id=None), 'pull_request.deletions': Value(dtype='int64', id=None), 'pull_request.guid': Value(dtype='string', id=None), 'pull_request.id': Value(dtype='int64', id=None), 'pull_request.merged_at': Value(dtype='null', id=None), 'pull_request.merged_by.login': Value(dtype='null', id=None), 'pull_request.milestone.description': Value(dtype='null', id=None), 'pull_request.milestone.number': Value(dtype='null', id=None), 'pull_request.milestone.title': Value(dtype='null', id=None), 'pull_request.number': Value(dtype='int64', id=None), 'pull_request.review_comments': Value(dtype='int64', id=None), 'pull_request.state': Value(dtype='string', id=None), 'pull_request.title': Value(dtype='string', id=None), 'pull_request.user.id': Value(dtype='int64', id=None), 'pull_request.user.login': Value(dtype='string', id=None), 'repo.id': Value(dtype='int64', id=None), 'repo.name': Value(dtype='string', id=None)} (expected either {'org.id': Value(dtype='null', id=None), 'public': Value(dtype='bool', id=None), 'pull_request.additions': Value(dtype='int64', id=None), 'pull_request.body': Value(dtype='string', id=None), 
'pull_request.changed_files': Value(dtype='int64', id=None), 'pull_request.closed_at': Value(dtype='null', id=None), 'pull_request.comments': Value(dtype='int64', id=None), 'pull_request.commits': Value(dtype='int64', id=None), 'pull_request.created_at': Value(dtype='string', id=None), 'pull_request.deletions': Value(dtype='int64', id=None), 'pull_request.guid': Value(dtype='string', id=None), 'pull_request.id': Value(dtype='int64', id=None), 'pull_request.merged_at': Value(dtype='null', id=None), 'pull_request.merged_by.login': Value(dtype='null', id=None), 'pull_request.milestone.description': Value(dtype='null', id=None), 'pull_request.milestone.number': Value(dtype='null', id=None), 'pull_request.milestone.title': Value(dtype='null', id=None), 'pull_request.number': Value(dtype='int64', id=None), 'pull_request.review_comments': Value(dtype='int64', id=None), 'pull_request.state': Value(dtype='string', id=None), 'pull_request.title': Value(dtype='string', id=None), 'pull_request.user.id': Value(dtype='int64', id=None), 'pull_request.user.login': Value(dtype='string', id=None), 'repo.id': Value(dtype='int64', id=None), 'repo.name': Value(dtype='string', id=None)} or Value(\"null\").\n" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'type': 'PullRequestEvent',\n", + " 'action': 'opened',\n", + " 'actor.login': 'dependabot[bot]',\n", + " 'actor.id': 49699333,\n", + " 'user.login': None,\n", + " 'user.id': None,\n", + " 'user.type': None,\n", + " 'repo.name': 'AbdElrahmanMuhammedNasr/WuzuufMasr',\n", + " 'repo.id': 210433834,\n", + " 'public': True,\n", + " 'created_at': '2022-12-10T03:27:08Z',\n", + " 'org.id': None,\n", + " 'org.login': None,\n", + " 'pull_request.id': 1157080683,\n", + " 'pull_request.number': 35,\n", + " 'pull_request.state': 'open',\n", + " 'pull_request.title': 'Bump qs from 6.5.2 to 6.5.3',\n", + " 'pull_request.body': 'Bumps 
[qs](https://github.com/ljharb/qs) from 6.5.2 to 6.5.3.\\n
\\nChangelog\\n

Sourced from qs\\'s changelog.

\\n
\\n

6.5.3

\\n
    \\n
  • [Fix] parse: ignore __proto__ keys (#428)
  • \\n
  • [Fix] utils.merge: avoid a crash with a null target and a truthy non-array source
  • \\n
  • [Fix] correctly parse nested arrays
  • \\n
  • [Fix] stringify: fix a crash with strictNullHandling and a custom filter/serializeDate (#279)
  • \\n
  • [Fix] utils: merge: fix crash when source is a truthy primitive & no options are provided
  • \\n
  • [Fix] when parseArrays is false, properly handle keys ending in []
  • \\n
  • [Fix] fix for an impossible situation: when the formatter is called with a non-string value
  • \\n
  • [Fix] utils.merge: avoid a crash with a null target and an array source
  • \\n
  • [Refactor] utils: reduce observable [[Get]]s
  • \\n
  • [Refactor] use cached Array.isArray
  • \\n
  • [Refactor] stringify: Avoid arr = arr.concat(...), push to the existing instance (#269)
  • \\n
  • [Refactor] parse: only need to reassign the var once
  • \\n
  • [Robustness] stringify: avoid relying on a global undefined (#427)
  • \\n
  • [readme] remove travis badge; add github actions/codecov badges; update URLs
  • \\n
  • [Docs] Clean up license text so it’s properly detected as BSD-3-Clause
  • \\n
  • [Docs] Clarify the need for "arrayLimit" option
  • \\n
  • [meta] fix README.md (#399)
  • \\n
  • [meta] add FUNDING.yml
  • \\n
  • [actions] backport actions from main
  • \\n
  • [Tests] always use String(x) over x.toString()
  • \\n
  • [Tests] remove nonexistent tape option
  • \\n
  • [Dev Deps] backport from main
  • \\n
\\n
\\n
\\n
\\nCommits\\n
    \\n
  • 298bfa5 v6.5.3
  • \\n
  • ed0f5dc [Fix] parse: ignore __proto__ keys (#428)
  • \\n
  • 691e739 [Robustness] stringify: avoid relying on a global undefined (#427)
  • \\n
  • 1072d57 [readme] remove travis badge; add github actions/codecov badges; update URLs
  • \\n
  • 12ac1c4 [meta] fix README.md (#399)
  • \\n
  • 0338716 [actions] backport actions from main
  • \\n
  • 5639c20 Clean up license text so it’s properly detected as BSD-3-Clause
  • \\n
  • 51b8a0b add FUNDING.yml
  • \\n
  • 45f6759 [Fix] fix for an impossible situation: when the formatter is called with a no...
  • \\n
  • f814a7f [Dev Deps] backport from main
  • \\n
  • Additional commits viewable in compare view
  • \\n
\\n
\\n
\\n\\n\\n[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=qs&package-manager=npm_and_yarn&previous-version=6.5.2&new-version=6.5.3)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)\\n\\nDependabot will resolve any conflicts with this PR as long as you don\\'t alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.\\n\\n[//]: # (dependabot-automerge-start)\\n[//]: # (dependabot-automerge-end)\\n\\n---\\n\\n
\\nDependabot commands and options\\n
\\n\\nYou can trigger Dependabot actions by commenting on this PR:\\n- `@dependabot rebase` will rebase this PR\\n- `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it\\n- `@dependabot merge` will merge this PR after your CI passes on it\\n- `@dependabot squash and merge` will squash and merge this PR after your CI passes on it\\n- `@dependabot cancel merge` will cancel a previously requested merge and block automerging\\n- `@dependabot reopen` will reopen this PR if it is closed\\n- `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually\\n- `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)\\n- `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)\\n- `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)\\n- `@dependabot use these labels` will set the current labels as the default for future PRs for this repo and language\\n- `@dependabot use these reviewers` will set the current reviewers as the default for future PRs for this repo and language\\n- `@dependabot use these assignees` will set the current assignees as the default for future PRs for this repo and language\\n- `@dependabot use this milestone` will set the current milestone as the default for future PRs for this repo and language\\n\\nYou can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/AbdElrahmanMuhammedNasr/WuzuufMasr/network/alerts).\\n\\n
',\n", + " 'pull_request.user.login': 'dependabot[bot]',\n", + " 'pull_request.user.id': 49699333,\n", + " 'pull_request.author_association': 'NONE',\n", + " 'pull_request.created_at': '2022-12-10T03:27:08Z',\n", + " 'pull_request.updated_at': '2022-12-10T03:27:08Z',\n", + " 'pull_request.closed_at': None,\n", + " 'pull_request.merged_at': None,\n", + " 'pull_request.merge_commit_sha': None,\n", + " 'pull_request.locked': False,\n", + " 'pull_request.assignee.login': None,\n", + " 'pull_request.assignee.id': None,\n", + " 'pull_request.assignee.type': None,\n", + " 'pull_request.assignee.site_admin': None,\n", + " 'pull_request.milestone.id': None,\n", + " 'pull_request.milestone.number': None,\n", + " 'pull_request.milestone.title': None,\n", + " 'pull_request.milestone.description': None,\n", + " 'pull_request.milestone.creator.login': None,\n", + " 'pull_request.milestone.creator.id': None,\n", + " 'pull_request.milestone.creator.type': None,\n", + " 'pull_request.milestone.creator.site_admin': None,\n", + " 'pull_request.milestone.open_issues': None,\n", + " 'pull_request.milestone.closed_issues': None,\n", + " 'pull_request.milestone.state': None,\n", + " 'pull_request.milestone.created_at': None,\n", + " 'pull_request.milestone.updated_at': None,\n", + " 'pull_request.milestone.due_on': None,\n", + " 'pull_request.milestone.closed_at': None,\n", + " 'pull_request.merged': False,\n", + " 'pull_request.mergeable': None,\n", + " 'pull_request.mergeable_state': 'unknown',\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.id': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.merged_by.site_admin': None,\n", + " 'pull_request.comments': 0,\n", + " 'pull_request.review_comments': 0,\n", + " 'pull_request.commits': 1,\n", + " 'pull_request.additions': 3,\n", + " 'pull_request.deletions': 3,\n", + " 'pull_request.changed_files': 1,\n", + " 'pull_request.label.id': None,\n", + " 'pull_request.label.name': None,\n", + 
" 'pull_request.label.color': None,\n", + " 'pull_request.label.default': None,\n", + " 'pull_request.head.label': 'AbdElrahmanMuhammedNasr:dependabot/npm_and_yarn/qs-6.5.3',\n", + " 'pull_request.head.ref': 'dependabot/npm_and_yarn/qs-6.5.3',\n", + " 'pull_request.head.sha': '94469b10a02fa77e95bb22aaa0fbcc16ef03edfd',\n", + " 'pull_request.head.user.login': 'AbdElrahmanMuhammedNasr',\n", + " 'pull_request.head.user.type': 'User',\n", + " 'pull_request.head.repo.name': 'WuzuufMasr',\n", + " 'pull_request.head.repo.full_name': 'AbdElrahmanMuhammedNasr/WuzuufMasr',\n", + " 'pull_request.head.repo.owner.login': 'AbdElrahmanMuhammedNasr',\n", + " 'pull_request.head.repo.owner.type': 'User',\n", + " 'pull_request.head.repo.private': False,\n", + " 'pull_request.head.repo.homepage': None,\n", + " 'pull_request.head.repo.description': None,\n", + " 'pull_request.head.repo.fork': False,\n", + " 'pull_request.head.repo.created_at': '2019-09-23T19:17:51Z',\n", + " 'pull_request.head.repo.updated_at': '2019-10-11T19:57:45Z',\n", + " 'pull_request.head.repo.pushed_at': '2022-12-10T03:27:07Z',\n", + " 'pull_request.head.repo.size': 1345,\n", + " 'pull_request.head.repo.stargazers_count': 0,\n", + " 'pull_request.head.repo.watchers_count': 0,\n", + " 'pull_request.head.repo.language': 'TypeScript',\n", + " 'pull_request.head.repo.has_issues': True,\n", + " 'pull_request.head.repo.has_projects': True,\n", + " 'pull_request.head.repo.has_downloads': True,\n", + " 'pull_request.head.repo.has_wiki': True,\n", + " 'pull_request.head.repo.has_pages': False,\n", + " 'pull_request.head.repo.forks_count': 0,\n", + " 'pull_request.head.repo.archived': False,\n", + " 'pull_request.head.repo.disabled': False,\n", + " 'pull_request.head.repo.open_issues_count': 24,\n", + " 'pull_request.head.repo.forks': 0,\n", + " 'pull_request.head.repo.open_issues': 24,\n", + " 'pull_request.head.repo.watchers': 0,\n", + " 'pull_request.head.repo.default_branch': 'master',\n", + " 
'pull_request.head.repo.license.key': None,\n", + " 'pull_request.head.repo.license.spdx_id': None,\n", + " 'pull_request.head.repo.license.name': None,\n", + " 'pull_request.base.label': 'AbdElrahmanMuhammedNasr:master',\n", + " 'pull_request.base.ref': 'master',\n", + " 'pull_request.base.sha': 'a7d0127c02152dca69c41f83afb1a0a4d0c0e004',\n", + " 'pull_request.base.user.login': 'AbdElrahmanMuhammedNasr',\n", + " 'pull_request.base.user.type': 'User',\n", + " 'pull_request.base.repo.name': 'WuzuufMasr',\n", + " 'pull_request.base.repo.full_name': 'AbdElrahmanMuhammedNasr/WuzuufMasr',\n", + " 'pull_request.base.repo.owner.login': 'AbdElrahmanMuhammedNasr',\n", + " 'pull_request.base.repo.owner.type': 'User',\n", + " 'pull_request.base.repo.private': False,\n", + " 'pull_request.base.repo.homepage': None,\n", + " 'pull_request.base.repo.description': None,\n", + " 'pull_request.base.repo.fork': False,\n", + " 'pull_request.base.repo.created_at': '2019-09-23T19:17:51Z',\n", + " 'pull_request.base.repo.updated_at': '2019-10-11T19:57:45Z',\n", + " 'pull_request.base.repo.pushed_at': '2022-12-10T03:27:07Z',\n", + " 'pull_request.base.repo.size': 1345,\n", + " 'pull_request.base.repo.stargazers_count': 0,\n", + " 'pull_request.base.repo.watchers_count': 0,\n", + " 'pull_request.base.repo.language': 'TypeScript',\n", + " 'pull_request.base.repo.has_issues': True,\n", + " 'pull_request.base.repo.has_projects': True,\n", + " 'pull_request.base.repo.has_downloads': True,\n", + " 'pull_request.base.repo.has_wiki': True,\n", + " 'pull_request.base.repo.has_pages': False,\n", + " 'pull_request.base.repo.forks_count': 0,\n", + " 'pull_request.base.repo.archived': False,\n", + " 'pull_request.base.repo.disabled': False,\n", + " 'pull_request.base.repo.open_issues_count': 24,\n", + " 'pull_request.base.repo.forks': 0,\n", + " 'pull_request.base.repo.open_issues': 24,\n", + " 'pull_request.base.repo.watchers': 0,\n", + " 'pull_request.base.repo.default_branch': 'master',\n", + " 
'pull_request.base.repo.license.key': None,\n", + " 'pull_request.base.repo.license.spdx_id': None,\n", + " 'pull_request.base.repo.license.name': None,\n", + " 'pull_request.guid': 'AbdElrahmanMuhammedNasr/WuzuufMasr/pull/35'}]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import json\n", + "res = json.loads(small_ds[0]['pull_request.events'])\n", + "res" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'action': 'opened',\n", + " 'author': 'hillc-usgs',\n", + " 'comment': None,\n", + " 'comment_id': None,\n", + " 'datetime': '2021-06-24T17:23:03Z',\n", + " 'description': 'This PR makes nldi_flowtools able to work with the new pygeoapi restructure, and makes it installable directly into the new tool. The processors are now contained within the library for nldi_flowtools directly, which makes it far simpler to roll out the plugin without needing coding modifications to the USGS pygeoapi tool.',\n", + " 'title': 'pygeoapi_plugins refit',\n", + " 'type': 'issue'},\n", + " {'action': 'created',\n", + " 'author': 'rmcd-mscb',\n", + " 'comment': \"@Anders-Hopkins - I merged Cliff's changes to keep things moving but you might want to review the changes for yourself when you get back. 
\",\n", + " 'comment_id': 868826717.0,\n", + " 'datetime': '2021-06-25 20:51:35+00:00',\n", + " 'description': None,\n", + " 'title': None,\n", + " 'type': 'comment'}]" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "issues = issues[0][\"events\"]\n", + "issues" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10\n" + ] + } + ], + "source": [ + "for i in range(3, 20):\n", + " row = small_ds[i]\n", + " events = load_json(row[\"pull_request.events\"])\n", + " reviews = load_json(row[\"pull_request.code_review_events\"])\n", + " issues = load_json(row[\"pull_request.issue_events\"])\n", + " if reviews:\n", + " print(i)\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "len events 2, len reviews 1, len issues 1\n" + ] + } + ], + "source": [ + "row = small_ds[10]\n", + "events = load_json(row[\"pull_request.events\"])\n", + "reviews = load_json(row[\"pull_request.code_review_events\"])\n", + "issues = load_json(row[\"pull_request.issue_events\"])\n", + "print(f\"len events {len(events)}, len reviews {len(reviews)}, len issues {len(issues)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "events = load_json(row[\"pull_request.events\"])\n", + "reviews = load_json(row[\"pull_request.code_review_events\"])\n", + "issues = load_json(row[\"pull_request.issue_events\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "L = events + reviews + issues" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "metadata": {}, + "outputs": [], + "source": [ + "events = load_json(row[\"pull_request.events\"])\n", + "reviews = 
load_json(row[\"pull_request.code_review_events\"])\n", + "issues = load_json(row[\"pull_request.issue_events\"])\n", + "assert len(issues) == 1\n", + "issues_events = issues[0][\"events\"]\n", + "# for each events in each category group all events sorted by \"created_at\" in one list\n", + "for e in issues_events:\n", + " e[\"created_at\"] = parse(e[\"datetime\"])\n", + " del e[\"datetime\"]\n", + "events = [update_datetime(e) for e in events]\n", + "reviews = [update_datetime(e) for e in reviews]\n", + "all_events = sorted(\n", + " events + reviews + issues_events,\n", + " key=lambda x: x[\"created_at\"]\n", + ")\n", + "\n", + "pr_info = {k: events[0][k] for k in pull_request_info_cols}\n", + "head_info = {k: events[0][k] for k in head_info_cols}\n", + "base_info = {k: events[0][k] for k in base_info_cols}\n", + "# each comment should have \"comments\" and \"review_comments\" fields with \"extra_review_info\" field\n", + "comments = [{\"type\": e[\"type\"],\n", + " \"action\": e[\"action\"],\n", + " \"created_at\": e[\"created_at\"],\n", + " \"review_comments\": get_review_info(e),\n", + " \"issues_comments\": get_issue_info(e)} for e in all_events]\n", + "new_row = {\"pull_request_info\": pr_info, \"head_repo_info\": head_info, \"base_repo_info\": base_info, \"events\": comments}" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['pull_request_info', 'head_repo_info', 'base_repo_info', 'events'])" + ] + }, + "execution_count": 131, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_row.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 146, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "**GitHub Repo**: ACWI-SSWD/nldi_flowtools, PR Number: 4, ID: 677298606\n", + "**GitHub Link**: https://github.com/ACWI-SSWD/nldi_flowtools/pull/4\n", + 
"----------------------------------------------------------------------------------------------------\n", + "Type: issue, action: opened, created_at: 2021-06-24 17:23:03+00:00\n", + "Author hillc-usgs did opened:\n", + "None\n", + "----------------------------------------------------------------------------------------------------\n", + "Type: PullRequestEvent, action: opened, created_at: 2021-06-24 17:23:04+00:00\n", + "Author hillc-usgs with association None did opened\n", + "----------------------------------------------------------------------------------------------------\n", + "Type: PullRequestReviewEvent, action: created, created_at: 2021-06-25 20:50:41+00:00\n", + "Author rmcd-mscb with association NONE did created\n", + "Review:\n", + "Thanks Cliff - Anders has been out this week, to keep things moving I'll merge the request and leave the branch for him to view when he gets back. \n", + "----------------------------------------------------------------------------------------------------\n", + "Type: PullRequestEvent, action: closed, created_at: 2021-06-25 20:50:54+00:00\n", + "Author rmcd-mscb with association None did closed\n", + "----------------------------------------------------------------------------------------------------\n", + "Type: comment, action: created, created_at: 2021-06-25 20:51:35+00:00\n", + "Author rmcd-mscb did created:\n", + "@Anders-Hopkins - I merged Cliff's changes to keep things moving but you might want to review the changes for yourself when you get back. 
\n" + ] + } + ], + "source": [ + "pr_info = new_row[\"pull_request_info\"]\n", + "res = f\"**GitHub Repo**: {pr_info['repo.name']}, PR Number: {pr_info['pull_request.number']}, ID: {pr_info['pull_request.id']}\"\n", + "gh_link = f\"https://github.com/{pr_info['repo.name']}/pull/{pr_info['pull_request.number']}\"\n", + "res += f\"\\n**GitHub Link**: {gh_link}\"\n", + "print(res)\n", + "for i in range(len(new_row[\"events\"])):\n", + " e = new_row[\"events\"][i]\n", + " print(\"-\" * 100)\n", + " print(f\"Type: {e['type']}, action: {e['action']}, created_at: {e['created_at']}\")\n", + " action = e['action']\n", + "\n", + " if e['type'] in [\"issue\", \"comment\"]:\n", + " e = e[\"issues_comments\"]\n", + " print(f\"Author {e['author']} did {e['action']}:\\n{e['comment']}\")\n", + "\n", + " elif e['type'] in [\"PullRequestEvent\", \"PullRequestReviewCommentEvent\", \"PullRequestReviewEvent\"]:\n", + " reviews = e[\"review_comments\"]\n", + " print(f\"Author {reviews['actor.login']} with association {reviews['review.author_association']} did {action}\")\n", + " if reviews['review.body']:\n", + " print(f\"Review:\\n{reviews['review.body']}\")\n", + " if reviews['comment.body']:\n", + " print(f\"Comment:\\n{reviews['comment.body']}\")\n", + " if reviews['comment.diff_hunk']:\n", + " print(f\"Diff hunk:\\n{reviews['diff_hunk']}\")\n", + " print(f\"File path {reviews['path']}\")\n", + " else:\n", + " print(\"OTHER\")\n", + " print(e[\"type\"])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'pull_request.base.label': 'ACWI-SSWD:master',\n", + " 'pull_request.base.ref': 'master',\n", + " 'pull_request.base.sha': '4ce49143e7ce6e473554c3ebf7335a23d91ca91c',\n", + " 'pull_request.base.user.login': 'ACWI-SSWD',\n", + " 'pull_request.base.user.type': 'Organization',\n", + " 'pull_request.base.repo.owner.login': 'ACWI-SSWD',\n", + " 'pull_request.base.repo.owner.type': 'Organization',\n", + " 
'pull_request.base.repo.license.name': 'BSD 3-Clause \"New\" or \"Revised\" License',\n", + " 'pull_request.base.repo.default_branch': 'master',\n", + " 'pull_request.base.repo.description': None,\n", + " 'pull_request.base.repo.language': 'Python',\n", + " 'pull_request.base.repo.watchers_count': 3,\n", + " 'pull_request.base.repo.open_issues_count': 1,\n", + " 'pull_request.base.repo.forks_count': 0,\n", + " 'pull_request.base.repo.name': 'nldi_flowtools',\n", + " 'pull_request.base.repo.homepage': None,\n", + " 'pull_request.base.repo.stargazers_count': 3,\n", + " 'pull_request.base.repo.private': False,\n", + " 'pull_request.comments': 0,\n", + " 'pull_request.review_comments': 0,\n", + " 'pull_request.label.name': None}" + ] + }, + "execution_count": 144, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pr_info" + ] + }, + { + "cell_type": "code", + "execution_count": 145, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'type': 'PullRequestEvent',\n", + " 'action': 'opened',\n", + " 'actor.login': 'hillc-usgs',\n", + " 'actor.id': 84474574,\n", + " 'user.login': None,\n", + " 'user.id': None,\n", + " 'user.type': None,\n", + " 'repo.name': 'ACWI-SSWD/nldi_flowtools',\n", + " 'repo.id': 365244721,\n", + " 'public': True,\n", + " 'created_at': datetime.datetime(2021, 6, 24, 17, 23, 4, tzinfo=tzlocal()),\n", + " 'org.id': 17301770,\n", + " 'org.login': 'ACWI-SSWD',\n", + " 'pull_request.id': 677298606,\n", + " 'pull_request.number': 4,\n", + " 'pull_request.state': 'open',\n", + " 'pull_request.title': 'pygeoapi_plugins refit',\n", + " 'pull_request.body': 'This PR makes nldi_flowtools able to work with the new pygeoapi restructure, and makes it installable directly into the new tool. 
The processors are now contained within the library for nldi_flowtools directly, which makes it far simpler to roll out the plugin without needing coding modifications to the USGS pygeoapi tool.',\n", + " 'pull_request.user.login': 'hillc-usgs',\n", + " 'pull_request.user.id': 84474574,\n", + " 'pull_request.author_association': 'NONE',\n", + " 'pull_request.created_at': '2021-06-24T17:23:03Z',\n", + " 'pull_request.updated_at': '2021-06-24T17:23:03Z',\n", + " 'pull_request.closed_at': None,\n", + " 'pull_request.merged_at': None,\n", + " 'pull_request.merge_commit_sha': None,\n", + " 'pull_request.locked': False,\n", + " 'pull_request.assignee.login': None,\n", + " 'pull_request.assignee.id': None,\n", + " 'pull_request.assignee.type': None,\n", + " 'pull_request.assignee.site_admin': None,\n", + " 'pull_request.milestone.id': None,\n", + " 'pull_request.milestone.number': None,\n", + " 'pull_request.milestone.title': None,\n", + " 'pull_request.milestone.description': None,\n", + " 'pull_request.milestone.creator.login': None,\n", + " 'pull_request.milestone.creator.id': None,\n", + " 'pull_request.milestone.creator.type': None,\n", + " 'pull_request.milestone.creator.site_admin': None,\n", + " 'pull_request.milestone.open_issues': None,\n", + " 'pull_request.milestone.closed_issues': None,\n", + " 'pull_request.milestone.state': None,\n", + " 'pull_request.milestone.created_at': None,\n", + " 'pull_request.milestone.updated_at': None,\n", + " 'pull_request.milestone.due_on': None,\n", + " 'pull_request.milestone.closed_at': None,\n", + " 'pull_request.merged': False,\n", + " 'pull_request.mergeable': None,\n", + " 'pull_request.mergeable_state': 'unknown',\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.id': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.merged_by.site_admin': None,\n", + " 'pull_request.comments': 0,\n", + " 'pull_request.review_comments': 0,\n", + " 'pull_request.commits': 5,\n", + " 
'pull_request.additions': 321,\n", + " 'pull_request.deletions': 25,\n", + " 'pull_request.changed_files': 5,\n", + " 'pull_request.label.id': None,\n", + " 'pull_request.label.name': None,\n", + " 'pull_request.label.color': None,\n", + " 'pull_request.label.default': None,\n", + " 'pull_request.head.label': 'ACWI-SSWD:pygeoapi_plugins-refit',\n", + " 'pull_request.head.ref': 'pygeoapi_plugins-refit',\n", + " 'pull_request.head.sha': '9143699913269aff0814979d932957efeb002eb1',\n", + " 'pull_request.head.user.login': 'ACWI-SSWD',\n", + " 'pull_request.head.user.type': 'Organization',\n", + " 'pull_request.head.repo.name': 'nldi_flowtools',\n", + " 'pull_request.head.repo.full_name': 'ACWI-SSWD/nldi_flowtools',\n", + " 'pull_request.head.repo.owner.login': 'ACWI-SSWD',\n", + " 'pull_request.head.repo.owner.type': 'Organization',\n", + " 'pull_request.head.repo.private': False,\n", + " 'pull_request.head.repo.homepage': None,\n", + " 'pull_request.head.repo.description': None,\n", + " 'pull_request.head.repo.fork': False,\n", + " 'pull_request.head.repo.created_at': '2021-05-07T13:36:47Z',\n", + " 'pull_request.head.repo.updated_at': '2021-06-23T14:27:31Z',\n", + " 'pull_request.head.repo.pushed_at': '2021-06-24T15:15:30Z',\n", + " 'pull_request.head.repo.size': 4309,\n", + " 'pull_request.head.repo.stargazers_count': 3,\n", + " 'pull_request.head.repo.watchers_count': 3,\n", + " 'pull_request.head.repo.language': 'Python',\n", + " 'pull_request.head.repo.has_issues': True,\n", + " 'pull_request.head.repo.has_projects': True,\n", + " 'pull_request.head.repo.has_downloads': True,\n", + " 'pull_request.head.repo.has_wiki': True,\n", + " 'pull_request.head.repo.has_pages': False,\n", + " 'pull_request.head.repo.forks_count': 0,\n", + " 'pull_request.head.repo.archived': False,\n", + " 'pull_request.head.repo.disabled': False,\n", + " 'pull_request.head.repo.open_issues_count': 1,\n", + " 'pull_request.head.repo.forks': 0,\n", + " 'pull_request.head.repo.open_issues': 
1,\n", + " 'pull_request.head.repo.watchers': 3,\n", + " 'pull_request.head.repo.default_branch': 'master',\n", + " 'pull_request.head.repo.license.key': 'bsd-3-clause',\n", + " 'pull_request.head.repo.license.spdx_id': 'BSD-3-Clause',\n", + " 'pull_request.head.repo.license.name': 'BSD 3-Clause \"New\" or \"Revised\" License',\n", + " 'pull_request.base.label': 'ACWI-SSWD:master',\n", + " 'pull_request.base.ref': 'master',\n", + " 'pull_request.base.sha': '4ce49143e7ce6e473554c3ebf7335a23d91ca91c',\n", + " 'pull_request.base.user.login': 'ACWI-SSWD',\n", + " 'pull_request.base.user.type': 'Organization',\n", + " 'pull_request.base.repo.name': 'nldi_flowtools',\n", + " 'pull_request.base.repo.full_name': 'ACWI-SSWD/nldi_flowtools',\n", + " 'pull_request.base.repo.owner.login': 'ACWI-SSWD',\n", + " 'pull_request.base.repo.owner.type': 'Organization',\n", + " 'pull_request.base.repo.private': False,\n", + " 'pull_request.base.repo.homepage': None,\n", + " 'pull_request.base.repo.description': None,\n", + " 'pull_request.base.repo.fork': False,\n", + " 'pull_request.base.repo.created_at': '2021-05-07T13:36:47Z',\n", + " 'pull_request.base.repo.updated_at': '2021-06-23T14:27:31Z',\n", + " 'pull_request.base.repo.pushed_at': '2021-06-24T15:15:30Z',\n", + " 'pull_request.base.repo.size': 4309,\n", + " 'pull_request.base.repo.stargazers_count': 3,\n", + " 'pull_request.base.repo.watchers_count': 3,\n", + " 'pull_request.base.repo.language': 'Python',\n", + " 'pull_request.base.repo.has_issues': True,\n", + " 'pull_request.base.repo.has_projects': True,\n", + " 'pull_request.base.repo.has_downloads': True,\n", + " 'pull_request.base.repo.has_wiki': True,\n", + " 'pull_request.base.repo.has_pages': False,\n", + " 'pull_request.base.repo.forks_count': 0,\n", + " 'pull_request.base.repo.archived': False,\n", + " 'pull_request.base.repo.disabled': False,\n", + " 'pull_request.base.repo.open_issues_count': 1,\n", + " 'pull_request.base.repo.forks': 0,\n", + " 
'pull_request.base.repo.open_issues': 1,\n", + " 'pull_request.base.repo.watchers': 3,\n", + " 'pull_request.base.repo.default_branch': 'master',\n", + " 'pull_request.base.repo.license.key': 'bsd-3-clause',\n", + " 'pull_request.base.repo.license.spdx_id': 'BSD-3-Clause',\n", + " 'pull_request.base.repo.license.name': 'BSD 3-Clause \"New\" or \"Revised\" License',\n", + " 'pull_request.guid': 'ACWI-SSWD/nldi_flowtools/pull/4'},\n", + " {'type': 'PullRequestEvent',\n", + " 'action': 'closed',\n", + " 'actor.login': 'rmcd-mscb',\n", + " 'actor.id': 11791580,\n", + " 'user.login': None,\n", + " 'user.id': None,\n", + " 'user.type': None,\n", + " 'repo.name': 'ACWI-SSWD/nldi_flowtools',\n", + " 'repo.id': 365244721,\n", + " 'public': True,\n", + " 'created_at': datetime.datetime(2021, 6, 25, 20, 50, 54, tzinfo=tzlocal()),\n", + " 'org.id': 17301770,\n", + " 'org.login': 'ACWI-SSWD',\n", + " 'pull_request.id': 677298606,\n", + " 'pull_request.number': 4,\n", + " 'pull_request.state': 'closed',\n", + " 'pull_request.title': 'pygeoapi_plugins refit',\n", + " 'pull_request.body': 'This PR makes nldi_flowtools able to work with the new pygeoapi restructure, and makes it installable directly into the new tool. 
The processors are now contained within the library for nldi_flowtools directly, which makes it far simpler to roll out the plugin without needing coding modifications to the USGS pygeoapi tool.',\n", + " 'pull_request.user.login': 'hillc-usgs',\n", + " 'pull_request.user.id': 84474574,\n", + " 'pull_request.author_association': 'NONE',\n", + " 'pull_request.created_at': '2021-06-24T17:23:03Z',\n", + " 'pull_request.updated_at': '2021-06-25T20:50:53Z',\n", + " 'pull_request.closed_at': '2021-06-25T20:50:53Z',\n", + " 'pull_request.merged_at': '2021-06-25T20:50:53Z',\n", + " 'pull_request.merge_commit_sha': 'c0a8e850c8e627b0474b9059582e7a61e5fd3699',\n", + " 'pull_request.locked': False,\n", + " 'pull_request.assignee.login': None,\n", + " 'pull_request.assignee.id': None,\n", + " 'pull_request.assignee.type': None,\n", + " 'pull_request.assignee.site_admin': None,\n", + " 'pull_request.milestone.id': None,\n", + " 'pull_request.milestone.number': None,\n", + " 'pull_request.milestone.title': None,\n", + " 'pull_request.milestone.description': None,\n", + " 'pull_request.milestone.creator.login': None,\n", + " 'pull_request.milestone.creator.id': None,\n", + " 'pull_request.milestone.creator.type': None,\n", + " 'pull_request.milestone.creator.site_admin': None,\n", + " 'pull_request.milestone.open_issues': None,\n", + " 'pull_request.milestone.closed_issues': None,\n", + " 'pull_request.milestone.state': None,\n", + " 'pull_request.milestone.created_at': None,\n", + " 'pull_request.milestone.updated_at': None,\n", + " 'pull_request.milestone.due_on': None,\n", + " 'pull_request.milestone.closed_at': None,\n", + " 'pull_request.merged': True,\n", + " 'pull_request.mergeable': None,\n", + " 'pull_request.mergeable_state': 'unknown',\n", + " 'pull_request.merged_by.login': 'rmcd-mscb',\n", + " 'pull_request.merged_by.id': 11791580,\n", + " 'pull_request.merged_by.type': 'User',\n", + " 'pull_request.merged_by.site_admin': False,\n", + " 'pull_request.comments': 0,\n", 
+ " 'pull_request.review_comments': 0,\n", + " 'pull_request.commits': 7,\n", + " 'pull_request.additions': 292,\n", + " 'pull_request.deletions': 1,\n", + " 'pull_request.changed_files': 5,\n", + " 'pull_request.label.id': None,\n", + " 'pull_request.label.name': None,\n", + " 'pull_request.label.color': None,\n", + " 'pull_request.label.default': None,\n", + " 'pull_request.head.label': 'ACWI-SSWD:pygeoapi_plugins-refit',\n", + " 'pull_request.head.ref': 'pygeoapi_plugins-refit',\n", + " 'pull_request.head.sha': '3e3fe0dfdfce5fe24c25231c3207c2d292b31165',\n", + " 'pull_request.head.user.login': 'ACWI-SSWD',\n", + " 'pull_request.head.user.type': 'Organization',\n", + " 'pull_request.head.repo.name': 'nldi_flowtools',\n", + " 'pull_request.head.repo.full_name': 'ACWI-SSWD/nldi_flowtools',\n", + " 'pull_request.head.repo.owner.login': 'ACWI-SSWD',\n", + " 'pull_request.head.repo.owner.type': 'Organization',\n", + " 'pull_request.head.repo.private': False,\n", + " 'pull_request.head.repo.homepage': None,\n", + " 'pull_request.head.repo.description': None,\n", + " 'pull_request.head.repo.fork': False,\n", + " 'pull_request.head.repo.created_at': '2021-05-07T13:36:47Z',\n", + " 'pull_request.head.repo.updated_at': '2021-06-23T14:27:31Z',\n", + " 'pull_request.head.repo.pushed_at': '2021-06-25T20:50:53Z',\n", + " 'pull_request.head.repo.size': 4310,\n", + " 'pull_request.head.repo.stargazers_count': 3,\n", + " 'pull_request.head.repo.watchers_count': 3,\n", + " 'pull_request.head.repo.language': 'Python',\n", + " 'pull_request.head.repo.has_issues': True,\n", + " 'pull_request.head.repo.has_projects': True,\n", + " 'pull_request.head.repo.has_downloads': True,\n", + " 'pull_request.head.repo.has_wiki': True,\n", + " 'pull_request.head.repo.has_pages': False,\n", + " 'pull_request.head.repo.forks_count': 0,\n", + " 'pull_request.head.repo.archived': False,\n", + " 'pull_request.head.repo.disabled': False,\n", + " 'pull_request.head.repo.open_issues_count': 0,\n", + " 
'pull_request.head.repo.forks': 0,\n", + " 'pull_request.head.repo.open_issues': 0,\n", + " 'pull_request.head.repo.watchers': 3,\n", + " 'pull_request.head.repo.default_branch': 'master',\n", + " 'pull_request.head.repo.license.key': 'bsd-3-clause',\n", + " 'pull_request.head.repo.license.spdx_id': 'BSD-3-Clause',\n", + " 'pull_request.head.repo.license.name': 'BSD 3-Clause \"New\" or \"Revised\" License',\n", + " 'pull_request.base.label': 'ACWI-SSWD:master',\n", + " 'pull_request.base.ref': 'master',\n", + " 'pull_request.base.sha': '4ce49143e7ce6e473554c3ebf7335a23d91ca91c',\n", + " 'pull_request.base.user.login': 'ACWI-SSWD',\n", + " 'pull_request.base.user.type': 'Organization',\n", + " 'pull_request.base.repo.name': 'nldi_flowtools',\n", + " 'pull_request.base.repo.full_name': 'ACWI-SSWD/nldi_flowtools',\n", + " 'pull_request.base.repo.owner.login': 'ACWI-SSWD',\n", + " 'pull_request.base.repo.owner.type': 'Organization',\n", + " 'pull_request.base.repo.private': False,\n", + " 'pull_request.base.repo.homepage': None,\n", + " 'pull_request.base.repo.description': None,\n", + " 'pull_request.base.repo.fork': False,\n", + " 'pull_request.base.repo.created_at': '2021-05-07T13:36:47Z',\n", + " 'pull_request.base.repo.updated_at': '2021-06-23T14:27:31Z',\n", + " 'pull_request.base.repo.pushed_at': '2021-06-25T20:50:53Z',\n", + " 'pull_request.base.repo.size': 4310,\n", + " 'pull_request.base.repo.stargazers_count': 3,\n", + " 'pull_request.base.repo.watchers_count': 3,\n", + " 'pull_request.base.repo.language': 'Python',\n", + " 'pull_request.base.repo.has_issues': True,\n", + " 'pull_request.base.repo.has_projects': True,\n", + " 'pull_request.base.repo.has_downloads': True,\n", + " 'pull_request.base.repo.has_wiki': True,\n", + " 'pull_request.base.repo.has_pages': False,\n", + " 'pull_request.base.repo.forks_count': 0,\n", + " 'pull_request.base.repo.archived': False,\n", + " 'pull_request.base.repo.disabled': False,\n", + " 
'pull_request.base.repo.open_issues_count': 0,\n", + " 'pull_request.base.repo.forks': 0,\n", + " 'pull_request.base.repo.open_issues': 0,\n", + " 'pull_request.base.repo.watchers': 3,\n", + " 'pull_request.base.repo.default_branch': 'master',\n", + " 'pull_request.base.repo.license.key': 'bsd-3-clause',\n", + " 'pull_request.base.repo.license.spdx_id': 'BSD-3-Clause',\n", + " 'pull_request.base.repo.license.name': 'BSD 3-Clause \"New\" or \"Revised\" License',\n", + " 'pull_request.guid': 'ACWI-SSWD/nldi_flowtools/pull/4'}]" + ] + }, + "execution_count": 145, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "events" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'2021-06-24T17:23:03Z'" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "events[0][\"created_at\"]\n", + "issues[0][\"events\"][0][\"datetime\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'action': 'created',\n", + " 'author': 'rmcd-mscb',\n", + " 'comment': \"@Anders-Hopkins - I merged Cliff's changes to keep things moving but you might want to review the changes for yourself when you get back. 
\",\n", + " 'comment_id': 868826717.0,\n", + " 'datetime': '2021-06-25 20:51:35+00:00',\n", + " 'description': None,\n", + " 'title': None,\n", + " 'type': 'comment'}" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "issues[0][\"events\"][1]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['repo', 'org', 'issue_id', 'issue_number', 'pull_request', 'events'])" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "issues[0].keys()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_events = sorted(\n", + " events + reviews + issues,\n", + " key=lambda x: x[\"created_at\"]\n", + ")\n", + "pr_info = {k: all_events[-1][k] for k in pull_request_info_cols}\n", + "head_info = {k: all_events[-1][k] for k in head_info_cols}\n", + "base_info = {k: all_events[-1][k] for k in base_info_cols}\n", + "# each comment should have \"comments\" and \"review_comments\" fields with \"extra_review_info\" field\n", + "comments = [{\"comments\": e[\"pull_request.comments\"],\n", + " \"review_comments\": e[\"pull_request.review_comments\"],\n", + " \"extra_review_info\": get_extra_review_info(e)} for e in all_events]\n", + "new_row = {\"pr_info\": pr_info, \"head_info\": head_info, \"base_info\": base_info, \"comments\": comments}" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'created_at'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[38], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m new_row \u001b[39m=\u001b[39m merge_events(row)\n", 
+ "Cell \u001b[0;32mIn[36], line 106\u001b[0m, in \u001b[0;36mmerge_events\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 102\u001b[0m issues \u001b[39m=\u001b[39m load_json(row[\u001b[39m\"\u001b[39m\u001b[39mpull_request.issue_events\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[1;32m 103\u001b[0m \u001b[39m# for each events in each category group all events sorted by \"created_at\" in one list\u001b[39;00m\n\u001b[1;32m 104\u001b[0m \u001b[39m# then merge all three lists\u001b[39;00m\n\u001b[1;32m 105\u001b[0m \u001b[39m# then sort by \"created_at\"\u001b[39;00m\n\u001b[0;32m--> 106\u001b[0m all_events \u001b[39m=\u001b[39m \u001b[39msorted\u001b[39;49m(\n\u001b[1;32m 107\u001b[0m events \u001b[39m+\u001b[39;49m reviews \u001b[39m+\u001b[39;49m issues,\n\u001b[1;32m 108\u001b[0m key\u001b[39m=\u001b[39;49m\u001b[39mlambda\u001b[39;49;00m x: x[\u001b[39m\"\u001b[39;49m\u001b[39mcreated_at\u001b[39;49m\u001b[39m\"\u001b[39;49m]\n\u001b[1;32m 109\u001b[0m )\n\u001b[1;32m 110\u001b[0m pr_info \u001b[39m=\u001b[39m {k: all_events[\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m][k] \u001b[39mfor\u001b[39;00m k \u001b[39min\u001b[39;00m pull_request_info_cols}\n\u001b[1;32m 111\u001b[0m head_info \u001b[39m=\u001b[39m {k: all_events[\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m][k] \u001b[39mfor\u001b[39;00m k \u001b[39min\u001b[39;00m head_info_cols}\n", + "Cell \u001b[0;32mIn[36], line 108\u001b[0m, in \u001b[0;36mmerge_events..\u001b[0;34m(x)\u001b[0m\n\u001b[1;32m 102\u001b[0m issues \u001b[39m=\u001b[39m load_json(row[\u001b[39m\"\u001b[39m\u001b[39mpull_request.issue_events\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[1;32m 103\u001b[0m \u001b[39m# for each events in each category group all events sorted by \"created_at\" in one list\u001b[39;00m\n\u001b[1;32m 104\u001b[0m \u001b[39m# then merge all three lists\u001b[39;00m\n\u001b[1;32m 105\u001b[0m \u001b[39m# then sort by \"created_at\"\u001b[39;00m\n\u001b[1;32m 106\u001b[0m all_events \u001b[39m=\u001b[39m 
\u001b[39msorted\u001b[39m(\n\u001b[1;32m 107\u001b[0m events \u001b[39m+\u001b[39m reviews \u001b[39m+\u001b[39m issues,\n\u001b[0;32m--> 108\u001b[0m key\u001b[39m=\u001b[39m\u001b[39mlambda\u001b[39;00m x: x[\u001b[39m\"\u001b[39;49m\u001b[39mcreated_at\u001b[39;49m\u001b[39m\"\u001b[39;49m]\n\u001b[1;32m 109\u001b[0m )\n\u001b[1;32m 110\u001b[0m pr_info \u001b[39m=\u001b[39m {k: all_events[\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m][k] \u001b[39mfor\u001b[39;00m k \u001b[39min\u001b[39;00m pull_request_info_cols}\n\u001b[1;32m 111\u001b[0m head_info \u001b[39m=\u001b[39m {k: all_events[\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m][k] \u001b[39mfor\u001b[39;00m k \u001b[39min\u001b[39;00m head_info_cols}\n", + "\u001b[0;31mKeyError\u001b[0m: 'created_at'" + ] + } + ], + "source": [ + "new_row = merge_events(row)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'repo': 'ACWI-SSWD/nldi_flowtools',\n", + " 'org': 'ACWI-SSWD',\n", + " 'issue_id': 929448726,\n", + " 'issue_number': 4,\n", + " 'pull_request': {'number': 4.0,\n", + " 'repo': 'nldi_flowtools',\n", + " 'user_login': 'ACWI-SSWD'},\n", + " 'events': [{'action': 'opened',\n", + " 'author': 'hillc-usgs',\n", + " 'comment': None,\n", + " 'comment_id': None,\n", + " 'datetime': '2021-06-24T17:23:03Z',\n", + " 'description': 'This PR makes nldi_flowtools able to work with the new pygeoapi restructure, and makes it installable directly into the new tool. The processors are now contained within the library for nldi_flowtools directly, which makes it far simpler to roll out the plugin without needing coding modifications to the USGS pygeoapi tool.',\n", + " 'title': 'pygeoapi_plugins refit',\n", + " 'type': 'issue'},\n", + " {'action': 'created',\n", + " 'author': 'rmcd-mscb',\n", + " 'comment': \"@Anders-Hopkins - I merged Cliff's changes to keep things moving but you might want to review the changes for yourself when you get back. 
\",\n", + " 'comment_id': 868826717.0,\n", + " 'datetime': '2021-06-25 20:51:35+00:00',\n", + " 'description': None,\n", + " 'title': None,\n", + " 'type': 'comment'}]}" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "issues" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# for each events in each category group all events sorted by \"created_at\" in one list\n", + "# then merge all three lists\n", + "# then sort by \"created_at\"\n", + "all_events = sorted(\n", + " events + reviews + issues,\n", + " key=lambda x: x[\"created_at\"]\n", + ")\n", + "pr_info = {k: all_events[-1][k] for k in pull_request_info_cols}\n", + "head_info = {k: all_events[-1][k] for k in head_info_cols}\n", + "base_info = {k: all_events[-1][k] for k in base_info_cols}\n", + "# each comment should have \"comments\" and \"review_comments\" fields with \"extra_review_info\" field\n", + "comments = [{\"comments\": e[\"pull_request.comments\"],\n", + " \"review_comments\": e[\"pull_request.review_comments\"],\n", + " \"extra_review_info\": get_extra_review_info(e)} for e in all_events]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pull_request_info_cols = [\n", + " \"repo.name\",\n", + " \"repo.id\",\n", + " \"org.id\",\n", + " \"public\",\n", + " \"pull_request.id\",\n", + " \"pull_request.guid\",\n", + " \"pull_request.number\",\n", + " \"pull_request.title\",\n", + " \"pull_request.body\",\n", + " \"pull_request.state\",\n", + " \"pull_request.user.login\",\n", + " \"pull_request.user.id\",\n", + " \"pull_request.created_at\",\n", + " \"pull_request.closed_at\",\n", + " \"pull_request.merged_at\",\n", + " \"pull_request.merged_by.login\",\n", + " \"pull_request.milestone.title\",\n", + " \"pull_request.milestone.description\",\n", + " \"pull_request.milestone.number\",\n", + " # 
commits\n", + " 'pull_request.commits',\n", + " 'pull_request.additions',\n", + " 'pull_request.deletions',\n", + " # changed files\n", + " 'pull_request.changed_files',\n", + "]\n", + "\n", + "comments = [\n", + " 'pull_request.comments',\n", + " 'pull_request.review_comments',\n", + " # for PR event\n", + " 'pull_request.label.name',\n", + " # review events only\n", + " 'review.state',\n", + " 'review.id', \n", + " 'review.body', \n", + " 'review.commit_id', \n", + " 'review.submitted_at', \n", + " 'review.author_association', '\n", + "]\n", + "\n", + "head_info_cols = [\n", + " \"pull_request.head.label\",\n", + " \"pull_request.head.ref\",\n", + " \"pull_request.head.user.login\",\n", + " \"pull_request.head.user.type\",\n", + " \"pull_request.head.repo.owner.login\",\n", + " \"pull_request.head.repo.owner.type\",\n", + " \"pull_request.head.repo.license.name\",\n", + " \"pull_request.head.sha\",\n", + " 'pull_request.head.repo.name',\n", + " 'pull_request.head.repo.owner.login',\n", + " 'pull_request.head.repo.homepage',\n", + " 'pull_request.head.repo.description',\n", + " 'pull_request.head.repo.language',\n", + " 'pull_request.head.repo.stargazers_count',\n", + " 'pull_request.head.repo.license.name',\n", + " 'pull_request.head.repo.default_branch',\n", + " 'pull_request.head.repo.private'\n", + "]\n", + "base_info_cols = [\n", + " \"pull_request.base.label\",\n", + " \"pull_request.base.ref\",\n", + " \"pull_request.base.sha\",\n", + " \"pull_request.base.user.login\",\n", + " \"pull_request.base.user.type\",\n", + " \"pull_request.base.repo.owner.login\",\n", + " \"pull_request.base.repo.owner.type\",\n", + " \"pull_request.base.repo.license.name\",\n", + " \"pull_request.base.repo.default_branch\",\n", + " \"pull_request.base.repo.description\",\n", + " \"pull_request.base.repo.language\",\n", + " \"pull_request.base.repo.watchers_count\",\n", + " \"pull_request.base.repo.open_issues_count\",\n", + " \"pull_request.base.repo.forks_count\",\n", + " 
'pull_request.base.repo.name',\n", + " 'pull_request.base.repo.owner.login',\n", + " 'pull_request.base.repo.homepage',\n", + " 'pull_request.base.repo.description',\n", + " 'pull_request.base.repo.language',\n", + " 'pull_request.base.repo.stargazers_count',\n", + " 'pull_request.base.repo.private',\n", + "]\n", + "# drop \"repo.name\", \"repo.id\", \"public\" so they are not duplicated and keep relevant columns that might change\n", + "event_cols = [\n", + " col\n", + " for col in df.columns\n", + " if (not col.startswith(\"pull_request.\"))\n", + " and col not in [\"repo.name\", \"repo.id\", \"public\"]\n", + "] + [\n", + " \"pull_request.head.label\",\n", + " \"pull_request.head.ref\",\n", + " \"pull_request.head.sha\",\n", + " \"pull_request.title\",\n", + "]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.11.0 ('.venv': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "0cc3054246fa39b40b564a97820c10836c9fb6acdf94e9196ea3a787cac26526" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From df0dc1b8ac081e3ca626bdf57bcb4954fafaaac9 Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Thu, 21 Sep 2023 13:57:45 +0200 Subject: [PATCH 5/5] update --- .../pull-requests/reconstruct_prs.ipynb | 4750 +---------------- 1 file changed, 28 insertions(+), 4722 deletions(-) diff --git a/data_analysis/pull-requests/reconstruct_prs.ipynb b/data_analysis/pull-requests/reconstruct_prs.ipynb index 9bc724c..db6a077 100644 --- a/data_analysis/pull-requests/reconstruct_prs.ipynb +++ b/data_analysis/pull-requests/reconstruct_prs.ipynb @@ -29,28 +29,6 @@ "pip install python-dateutil" ] }, - { - "cell_type": "code", - "execution_count": 
329, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading readme: 100%|██████████| 8.02k/8.02k [00:00<00:00, 1.52MB/s]\n" - ] - } - ], - "source": [ - "import json\n", - "import pandas as pd\n", - "from dateutil.parser import parse\n", - "from datasets import load_dataset, Dataset\n", - "\n", - "small_ds = load_dataset(\"bigcode-data/the-stack-gh-pull-requests\", use_auth_token=True, split=\"train\", streaming=True)" - ] - }, { "cell_type": "code", "execution_count": 330, @@ -69,17 +47,8 @@ "ds = small_ds.shuffle(seed=0, buffer_size=1_000_000)\n", "\n", "# 10k subset of random samples from ds\n", - "fianl_ds = list(ds.take(size))\n", - "ds = Dataset.from_pandas(pd.DataFrame(data=fianl_ds))" - ] - }, - { - "cell_type": "code", - "execution_count": 332, - "metadata": {}, - "outputs": [], - "source": [ - "ds = Dataset.from_pandas(pd.DataFrame(data=fianl_ds))" + "ds = list(ds.take(size))\n", + "ds = Dataset.from_pandas(pd.DataFrame(data=ds))" ] }, { @@ -107,353 +76,7 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import pandas as pd\n", - "from dateutil.parser import parse\n", - "from datasets import load_dataset, Dataset\n", - "\n", - "small_ds = load_dataset(\"bigcode-data/the-stack-gh-pull-requests\", use_auth_token=True, split=\"train\", streaming=True)\n", - "\n", - "size = 500_000\n", - "\n", - "ds = small_ds.shuffle(seed=0, buffer_size=1_000_000)\n", - "\n", - "# 10k subset of random samples from ds\n", - "fianl_ds = list(ds.take(size))\n", - "ds = Dataset.from_pandas(pd.DataFrame(data=fianl_ds))" - ] - }, - { - "cell_type": "code", - "execution_count": 335, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'__index_level_0__': 46164,\n", - " 'bucket': None,\n", - " 'pull_request.code_review_events': None,\n", - " 'pull_request.events': '[{\"type\": \"PullRequestEvent\", 
\"action\": \"opened\", '\n", - " '\"actor.login\": \"pull[bot]\", \"actor.id\": 39814207, '\n", - " '\"user.login\": null, \"user.id\": null, \"user.type\": '\n", - " 'null, \"repo.name\": \"kofj/website\", \"repo.id\": '\n", - " '158894695, \"public\": true, \"created_at\": '\n", - " '\"2020-11-23T05:58:40Z\", \"org.id\": null, \"org.login\": '\n", - " 'null, \"pull_request.id\": 525472638, '\n", - " '\"pull_request.number\": 2460, \"pull_request.state\": '\n", - " '\"open\", \"pull_request.title\": \"[pull] master from '\n", - " 'kubernetes:master\", \"pull_request.body\": \"See Commits '\n", - " 'and Changes for more details.\\\\n\\\\n-----\\\\nCreated by '\n", - " '[ '\n", - " '**pull[bot]**](https://github.com/wei/pull)\\\\n\\\\n_Can '\n", - " 'you help keep this open source service alive? '\n", - " '**[\\\\ud83d\\\\udc96 Please sponsor : '\n", - " ')](https://prod.download/pull-pr-sponsor)**_\", '\n", - " '\"pull_request.user.login\": \"pull[bot]\", '\n", - " '\"pull_request.user.id\": 39814207, '\n", - " '\"pull_request.author_association\": \"NONE\", '\n", - " '\"pull_request.created_at\": \"2020-11-23T05:58:39Z\", '\n", - " '\"pull_request.updated_at\": \"2020-11-23T05:58:39Z\", '\n", - " '\"pull_request.closed_at\": null, '\n", - " '\"pull_request.merged_at\": null, '\n", - " '\"pull_request.merge_commit_sha\": null, '\n", - " '\"pull_request.locked\": false, '\n", - " '\"pull_request.assignee.login\": null, '\n", - " '\"pull_request.assignee.id\": null, '\n", - " '\"pull_request.assignee.type\": null, '\n", - " '\"pull_request.assignee.site_admin\": null, '\n", - " '\"pull_request.milestone.id\": null, '\n", - " '\"pull_request.milestone.number\": null, '\n", - " '\"pull_request.milestone.title\": null, '\n", - " '\"pull_request.milestone.description\": null, '\n", - " '\"pull_request.milestone.creator.login\": null, '\n", - " '\"pull_request.milestone.creator.id\": null, '\n", - " '\"pull_request.milestone.creator.type\": null, '\n", - " 
'\"pull_request.milestone.creator.site_admin\": null, '\n", - " '\"pull_request.milestone.open_issues\": null, '\n", - " '\"pull_request.milestone.closed_issues\": null, '\n", - " '\"pull_request.milestone.state\": null, '\n", - " '\"pull_request.milestone.created_at\": null, '\n", - " '\"pull_request.milestone.updated_at\": null, '\n", - " '\"pull_request.milestone.due_on\": null, '\n", - " '\"pull_request.milestone.closed_at\": null, '\n", - " '\"pull_request.merged\": false, '\n", - " '\"pull_request.mergeable\": null, '\n", - " '\"pull_request.mergeable_state\": \"unknown\", '\n", - " '\"pull_request.merged_by.login\": null, '\n", - " '\"pull_request.merged_by.id\": null, '\n", - " '\"pull_request.merged_by.type\": null, '\n", - " '\"pull_request.merged_by.site_admin\": null, '\n", - " '\"pull_request.comments\": 0, '\n", - " '\"pull_request.review_comments\": 0, '\n", - " '\"pull_request.commits\": 4, \"pull_request.additions\": '\n", - " '243, \"pull_request.deletions\": 0, '\n", - " '\"pull_request.changed_files\": 2, '\n", - " '\"pull_request.label.id\": null, '\n", - " '\"pull_request.label.name\": null, '\n", - " '\"pull_request.label.color\": null, '\n", - " '\"pull_request.label.default\": null, '\n", - " '\"pull_request.head.label\": \"kubernetes:master\", '\n", - " '\"pull_request.head.ref\": \"master\", '\n", - " '\"pull_request.head.sha\": '\n", - " '\"fd65678baa464abe7715dbf6df44284506c858a2\", '\n", - " '\"pull_request.head.user.login\": \"kubernetes\", '\n", - " '\"pull_request.head.user.type\": \"Organization\", '\n", - " '\"pull_request.head.repo.name\": \"website\", '\n", - " '\"pull_request.head.repo.full_name\": '\n", - " '\"kubernetes/website\", '\n", - " '\"pull_request.head.repo.owner.login\": \"kubernetes\", '\n", - " '\"pull_request.head.repo.owner.type\": \"Organization\", '\n", - " '\"pull_request.head.repo.private\": false, '\n", - " '\"pull_request.head.repo.homepage\": '\n", - " '\"https://kubernetes.io\", '\n", - " 
'\"pull_request.head.repo.description\": \"Kubernetes '\n", - " 'website and documentation repo: \", '\n", - " '\"pull_request.head.repo.fork\": false, '\n", - " '\"pull_request.head.repo.created_at\": '\n", - " '\"2016-02-10T22:46:48Z\", '\n", - " '\"pull_request.head.repo.updated_at\": '\n", - " '\"2020-11-23T02:09:41Z\", '\n", - " '\"pull_request.head.repo.pushed_at\": '\n", - " '\"2020-11-23T05:12:37Z\", '\n", - " '\"pull_request.head.repo.size\": 319781, '\n", - " '\"pull_request.head.repo.stargazers_count\": 2267, '\n", - " '\"pull_request.head.repo.watchers_count\": 2267, '\n", - " '\"pull_request.head.repo.language\": \"HTML\", '\n", - " '\"pull_request.head.repo.has_issues\": true, '\n", - " '\"pull_request.head.repo.has_projects\": true, '\n", - " '\"pull_request.head.repo.has_downloads\": true, '\n", - " '\"pull_request.head.repo.has_wiki\": true, '\n", - " '\"pull_request.head.repo.has_pages\": false, '\n", - " '\"pull_request.head.repo.forks_count\": 8508, '\n", - " '\"pull_request.head.repo.archived\": false, '\n", - " '\"pull_request.head.repo.disabled\": false, '\n", - " '\"pull_request.head.repo.open_issues_count\": 641, '\n", - " '\"pull_request.head.repo.forks\": 8508, '\n", - " '\"pull_request.head.repo.open_issues\": 641, '\n", - " '\"pull_request.head.repo.watchers\": 2267, '\n", - " '\"pull_request.head.repo.default_branch\": \"master\", '\n", - " '\"pull_request.head.repo.license.key\": \"cc-by-4.0\", '\n", - " '\"pull_request.head.repo.license.spdx_id\": '\n", - " '\"CC-BY-4.0\", \"pull_request.head.repo.license.name\": '\n", - " '\"Creative Commons Attribution 4.0 International\", '\n", - " '\"pull_request.base.label\": \"kofj:master\", '\n", - " '\"pull_request.base.ref\": \"master\", '\n", - " '\"pull_request.base.sha\": '\n", - " '\"97a882c38db18684471447d06dd15c984302e0a7\", '\n", - " '\"pull_request.base.user.login\": \"kofj\", '\n", - " '\"pull_request.base.user.type\": \"User\", '\n", - " '\"pull_request.base.repo.name\": 
\"website\", '\n", - " '\"pull_request.base.repo.full_name\": \"kofj/website\", '\n", - " '\"pull_request.base.repo.owner.login\": \"kofj\", '\n", - " '\"pull_request.base.repo.owner.type\": \"User\", '\n", - " '\"pull_request.base.repo.private\": false, '\n", - " '\"pull_request.base.repo.homepage\": '\n", - " '\"https://kubernetes.io\", '\n", - " '\"pull_request.base.repo.description\": \"Kubernetes '\n", - " 'website and documentation repo: \", '\n", - " '\"pull_request.base.repo.fork\": true, '\n", - " '\"pull_request.base.repo.created_at\": '\n", - " '\"2018-11-24T02:12:25Z\", '\n", - " '\"pull_request.base.repo.updated_at\": '\n", - " '\"2020-11-23T01:58:46Z\", '\n", - " '\"pull_request.base.repo.pushed_at\": '\n", - " '\"2020-11-23T01:58:43Z\", '\n", - " '\"pull_request.base.repo.size\": 286251, '\n", - " '\"pull_request.base.repo.stargazers_count\": 0, '\n", - " '\"pull_request.base.repo.watchers_count\": 0, '\n", - " '\"pull_request.base.repo.language\": \"HTML\", '\n", - " '\"pull_request.base.repo.has_issues\": false, '\n", - " '\"pull_request.base.repo.has_projects\": true, '\n", - " '\"pull_request.base.repo.has_downloads\": true, '\n", - " '\"pull_request.base.repo.has_wiki\": true, '\n", - " '\"pull_request.base.repo.has_pages\": false, '\n", - " '\"pull_request.base.repo.forks_count\": 0, '\n", - " '\"pull_request.base.repo.archived\": false, '\n", - " '\"pull_request.base.repo.disabled\": false, '\n", - " '\"pull_request.base.repo.open_issues_count\": 1, '\n", - " '\"pull_request.base.repo.forks\": 0, '\n", - " '\"pull_request.base.repo.open_issues\": 1, '\n", - " '\"pull_request.base.repo.watchers\": 0, '\n", - " '\"pull_request.base.repo.default_branch\": \"master\", '\n", - " '\"pull_request.base.repo.license.key\": \"cc-by-4.0\", '\n", - " '\"pull_request.base.repo.license.spdx_id\": '\n", - " '\"CC-BY-4.0\", \"pull_request.base.repo.license.name\": '\n", - " '\"Creative Commons Attribution 4.0 International\", '\n", - " '\"pull_request.guid\": 
\"kofj/website/pull/2460\"}, '\n", - " '{\"type\": \"PullRequestEvent\", \"action\": \"closed\", '\n", - " '\"actor.login\": \"pull[bot]\", \"actor.id\": 39814207, '\n", - " '\"user.login\": null, \"user.id\": null, \"user.type\": '\n", - " 'null, \"repo.name\": \"kofj/website\", \"repo.id\": '\n", - " '158894695, \"public\": true, \"created_at\": '\n", - " '\"2020-11-23T05:58:50Z\", \"org.id\": null, \"org.login\": '\n", - " 'null, \"pull_request.id\": 525472638, '\n", - " '\"pull_request.number\": 2460, \"pull_request.state\": '\n", - " '\"closed\", \"pull_request.title\": \"[pull] master from '\n", - " 'kubernetes:master\", \"pull_request.body\": \"See '\n", - " '[Commits](/kofj/website/pull/2460/commits) and '\n", - " '[Changes](/kofj/website/pull/2460/files) for more '\n", - " 'details.\\\\n\\\\n-----\\\\nCreated by [ '\n", - " '**pull[bot]**](https://github.com/wei/pull)\\\\n\\\\n_Can '\n", - " 'you help keep this open source service alive? '\n", - " '**[\\\\ud83d\\\\udc96 Please sponsor : '\n", - " ')](https://prod.download/pull-pr-sponsor)**_\", '\n", - " '\"pull_request.user.login\": \"pull[bot]\", '\n", - " '\"pull_request.user.id\": 39814207, '\n", - " '\"pull_request.author_association\": \"NONE\", '\n", - " '\"pull_request.created_at\": \"2020-11-23T05:58:39Z\", '\n", - " '\"pull_request.updated_at\": \"2020-11-23T05:58:50Z\", '\n", - " '\"pull_request.closed_at\": \"2020-11-23T05:58:50Z\", '\n", - " '\"pull_request.merged_at\": \"2020-11-23T05:58:49Z\", '\n", - " '\"pull_request.merge_commit_sha\": '\n", - " '\"fd65678baa464abe7715dbf6df44284506c858a2\", '\n", - " '\"pull_request.locked\": false, '\n", - " '\"pull_request.assignee.login\": null, '\n", - " '\"pull_request.assignee.id\": null, '\n", - " '\"pull_request.assignee.type\": null, '\n", - " '\"pull_request.assignee.site_admin\": null, '\n", - " '\"pull_request.milestone.id\": null, '\n", - " '\"pull_request.milestone.number\": null, '\n", - " '\"pull_request.milestone.title\": null, '\n", - " 
'\"pull_request.milestone.description\": null, '\n", - " '\"pull_request.milestone.creator.login\": null, '\n", - " '\"pull_request.milestone.creator.id\": null, '\n", - " '\"pull_request.milestone.creator.type\": null, '\n", - " '\"pull_request.milestone.creator.site_admin\": null, '\n", - " '\"pull_request.milestone.open_issues\": null, '\n", - " '\"pull_request.milestone.closed_issues\": null, '\n", - " '\"pull_request.milestone.state\": null, '\n", - " '\"pull_request.milestone.created_at\": null, '\n", - " '\"pull_request.milestone.updated_at\": null, '\n", - " '\"pull_request.milestone.due_on\": null, '\n", - " '\"pull_request.milestone.closed_at\": null, '\n", - " '\"pull_request.merged\": true, '\n", - " '\"pull_request.mergeable\": null, '\n", - " '\"pull_request.mergeable_state\": \"unknown\", '\n", - " '\"pull_request.merged_by.login\": \"pull[bot]\", '\n", - " '\"pull_request.merged_by.id\": 39814207, '\n", - " '\"pull_request.merged_by.type\": \"Bot\", '\n", - " '\"pull_request.merged_by.site_admin\": false, '\n", - " '\"pull_request.comments\": 0, '\n", - " '\"pull_request.review_comments\": 0, '\n", - " '\"pull_request.commits\": 4, \"pull_request.additions\": '\n", - " '243, \"pull_request.deletions\": 0, '\n", - " '\"pull_request.changed_files\": 2, '\n", - " '\"pull_request.label.id\": null, '\n", - " '\"pull_request.label.name\": null, '\n", - " '\"pull_request.label.color\": null, '\n", - " '\"pull_request.label.default\": null, '\n", - " '\"pull_request.head.label\": \"kubernetes:master\", '\n", - " '\"pull_request.head.ref\": \"master\", '\n", - " '\"pull_request.head.sha\": '\n", - " '\"fd65678baa464abe7715dbf6df44284506c858a2\", '\n", - " '\"pull_request.head.user.login\": \"kubernetes\", '\n", - " '\"pull_request.head.user.type\": \"Organization\", '\n", - " '\"pull_request.head.repo.name\": \"website\", '\n", - " '\"pull_request.head.repo.full_name\": '\n", - " '\"kubernetes/website\", '\n", - " '\"pull_request.head.repo.owner.login\": 
\"kubernetes\", '\n", - " '\"pull_request.head.repo.owner.type\": \"Organization\", '\n", - " '\"pull_request.head.repo.private\": false, '\n", - " '\"pull_request.head.repo.homepage\": '\n", - " '\"https://kubernetes.io\", '\n", - " '\"pull_request.head.repo.description\": \"Kubernetes '\n", - " 'website and documentation repo: \", '\n", - " '\"pull_request.head.repo.fork\": false, '\n", - " '\"pull_request.head.repo.created_at\": '\n", - " '\"2016-02-10T22:46:48Z\", '\n", - " '\"pull_request.head.repo.updated_at\": '\n", - " '\"2020-11-23T02:09:41Z\", '\n", - " '\"pull_request.head.repo.pushed_at\": '\n", - " '\"2020-11-23T05:12:37Z\", '\n", - " '\"pull_request.head.repo.size\": 319781, '\n", - " '\"pull_request.head.repo.stargazers_count\": 2267, '\n", - " '\"pull_request.head.repo.watchers_count\": 2267, '\n", - " '\"pull_request.head.repo.language\": \"HTML\", '\n", - " '\"pull_request.head.repo.has_issues\": true, '\n", - " '\"pull_request.head.repo.has_projects\": true, '\n", - " '\"pull_request.head.repo.has_downloads\": true, '\n", - " '\"pull_request.head.repo.has_wiki\": true, '\n", - " '\"pull_request.head.repo.has_pages\": false, '\n", - " '\"pull_request.head.repo.forks_count\": 8508, '\n", - " '\"pull_request.head.repo.archived\": false, '\n", - " '\"pull_request.head.repo.disabled\": false, '\n", - " '\"pull_request.head.repo.open_issues_count\": 641, '\n", - " '\"pull_request.head.repo.forks\": 8508, '\n", - " '\"pull_request.head.repo.open_issues\": 641, '\n", - " '\"pull_request.head.repo.watchers\": 2267, '\n", - " '\"pull_request.head.repo.default_branch\": \"master\", '\n", - " '\"pull_request.head.repo.license.key\": \"cc-by-4.0\", '\n", - " '\"pull_request.head.repo.license.spdx_id\": '\n", - " '\"CC-BY-4.0\", \"pull_request.head.repo.license.name\": '\n", - " '\"Creative Commons Attribution 4.0 International\", '\n", - " '\"pull_request.base.label\": \"kofj:master\", '\n", - " '\"pull_request.base.ref\": \"master\", '\n", - " 
'\"pull_request.base.sha\": '\n", - " '\"97a882c38db18684471447d06dd15c984302e0a7\", '\n", - " '\"pull_request.base.user.login\": \"kofj\", '\n", - " '\"pull_request.base.user.type\": \"User\", '\n", - " '\"pull_request.base.repo.name\": \"website\", '\n", - " '\"pull_request.base.repo.full_name\": \"kofj/website\", '\n", - " '\"pull_request.base.repo.owner.login\": \"kofj\", '\n", - " '\"pull_request.base.repo.owner.type\": \"User\", '\n", - " '\"pull_request.base.repo.private\": false, '\n", - " '\"pull_request.base.repo.homepage\": '\n", - " '\"https://kubernetes.io\", '\n", - " '\"pull_request.base.repo.description\": \"Kubernetes '\n", - " 'website and documentation repo: \", '\n", - " '\"pull_request.base.repo.fork\": true, '\n", - " '\"pull_request.base.repo.created_at\": '\n", - " '\"2018-11-24T02:12:25Z\", '\n", - " '\"pull_request.base.repo.updated_at\": '\n", - " '\"2020-11-23T01:58:46Z\", '\n", - " '\"pull_request.base.repo.pushed_at\": '\n", - " '\"2020-11-23T05:58:46Z\", '\n", - " '\"pull_request.base.repo.size\": 286251, '\n", - " '\"pull_request.base.repo.stargazers_count\": 0, '\n", - " '\"pull_request.base.repo.watchers_count\": 0, '\n", - " '\"pull_request.base.repo.language\": \"HTML\", '\n", - " '\"pull_request.base.repo.has_issues\": false, '\n", - " '\"pull_request.base.repo.has_projects\": true, '\n", - " '\"pull_request.base.repo.has_downloads\": true, '\n", - " '\"pull_request.base.repo.has_wiki\": true, '\n", - " '\"pull_request.base.repo.has_pages\": false, '\n", - " '\"pull_request.base.repo.forks_count\": 0, '\n", - " '\"pull_request.base.repo.archived\": false, '\n", - " '\"pull_request.base.repo.disabled\": false, '\n", - " '\"pull_request.base.repo.open_issues_count\": 0, '\n", - " '\"pull_request.base.repo.forks\": 0, '\n", - " '\"pull_request.base.repo.open_issues\": 0, '\n", - " '\"pull_request.base.repo.watchers\": 0, '\n", - " '\"pull_request.base.repo.default_branch\": \"master\", '\n", - " 
'\"pull_request.base.repo.license.key\": \"cc-by-4.0\", '\n", - " '\"pull_request.base.repo.license.spdx_id\": '\n", - " '\"CC-BY-4.0\", \"pull_request.base.repo.license.name\": '\n", - " '\"Creative Commons Attribution 4.0 International\", '\n", - " '\"pull_request.guid\": \"kofj/website/pull/2460\"}]',\n", - " 'pull_request.guid': 'kofj/website/pull/2460',\n", - " 'pull_request.issue_events': None}\n" - ] - } - ], - "source": [ - "from pprint import pprint\n", - "\n", - "pprint(ds[0])" - ] - }, - { - "cell_type": "code", - "execution_count": 412, + "execution_count": 444, "metadata": {}, "outputs": [], "source": [ @@ -625,16 +248,15 @@ " try:\n", " base_data = events[0] if events else reviews[0]\n", " except IndexError:\n", + " # init empty dict\n", + " base_data = {}\n", " if issues:\n", - " base_data = issues_events[0]\n", - " first_event = issues[0][\"events\"][0]\n", - " base_data['pull_request.title'] = first_event[\"title\"]\n", - " print(f'base data keys: {base_data.keys()}')\n", - " base_data[\"repo.name\"] = base_data[\"repo\"]\n", - " base_data[\"org.id\"] = base_data[\"org\"]\n", - " base_data[\"repo.name\"] = base_data[\"repo\"]\n", - " base_data[\"pull_request.number\"] = int(base_data[\"pull_request\"][\"number\"])\n", - " base_data[\"pull_request.user.login\"] = base_data[\"pull_request\"][\"user_login\"]\n", + " base_data = {}\n", + " first_event = issues[0]\n", + " base_data['pull_request.title'] = first_event[\"events\"][0][\"title\"]\n", + " base_data[\"repo.name\"] = first_event[\"repo\"]\n", + " base_data[\"pull_request.number\"] = first_event[\"pull_request\"][\"number\"]\n", + " base_data[\"pull_request.user.login\"] = first_event[\"pull_request\"][\"user_login\"]\n", " print(\"filling PR data from issue event\")\n", " else:\n", " raise IndexError(\"No events for PR\")\n", @@ -660,4356 +282,40 @@ }, { "cell_type": "code", - "execution_count": 413, - "metadata": {}, - "outputs": [ - { - "ename": "KeyError", - "evalue": "'repo'", - 
"output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[412], line 167\u001b[0m, in \u001b[0;36mmerge_events\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 166\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 167\u001b[0m base_data \u001b[39m=\u001b[39m events[\u001b[39m0\u001b[39m] \u001b[39mif\u001b[39;00m events \u001b[39melse\u001b[39;00m reviews[\u001b[39m0\u001b[39;49m]\n\u001b[1;32m 168\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mIndexError\u001b[39;00m:\n", - "\u001b[0;31mIndexError\u001b[0m: list index out of range", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[413], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m new_row \u001b[39m=\u001b[39m merge_events(row)\n", - "Cell \u001b[0;32mIn[412], line 173\u001b[0m, in \u001b[0;36mmerge_events\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 171\u001b[0m first_event \u001b[39m=\u001b[39m issues[\u001b[39m0\u001b[39m][\u001b[39m\"\u001b[39m\u001b[39mevents\u001b[39m\u001b[39m\"\u001b[39m][\u001b[39m0\u001b[39m]\n\u001b[1;32m 172\u001b[0m base_data[\u001b[39m'\u001b[39m\u001b[39mpull_request.title\u001b[39m\u001b[39m'\u001b[39m] \u001b[39m=\u001b[39m first_event[\u001b[39m\"\u001b[39m\u001b[39mtitle\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[0;32m--> 173\u001b[0m base_data[\u001b[39m\"\u001b[39m\u001b[39mrepo.name\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m base_data[\u001b[39m\"\u001b[39;49m\u001b[39mrepo\u001b[39;49m\u001b[39m\"\u001b[39;49m]\n\u001b[1;32m 174\u001b[0m base_data[\u001b[39m\"\u001b[39m\u001b[39morg.id\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m base_data[\u001b[39m\"\u001b[39m\u001b[39morg\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[1;32m 175\u001b[0m 
base_data[\u001b[39m\"\u001b[39m\u001b[39mrepo.name\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m base_data[\u001b[39m\"\u001b[39m\u001b[39mrepo\u001b[39m\u001b[39m\"\u001b[39m]\n", - "\u001b[0;31mKeyError\u001b[0m: 'repo'" - ] - } - ], - "source": [ - "new_row = merge_events(row)" - ] - }, - { - "cell_type": "code", - "execution_count": 411, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - }, - { - "ename": "KeyError", - "evalue": "'repo'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[410], line 167\u001b[0m, in \u001b[0;36mmerge_events\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 166\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 167\u001b[0m base_data \u001b[39m=\u001b[39m events[\u001b[39m0\u001b[39m] \u001b[39mif\u001b[39;00m events \u001b[39melse\u001b[39;00m reviews[\u001b[39m0\u001b[39;49m]\n\u001b[1;32m 168\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mIndexError\u001b[39;00m:\n", - "\u001b[0;31mIndexError\u001b[0m: list index out of range", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[411], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m small_ds_2 \u001b[39m=\u001b[39m ds\u001b[39m.\u001b[39mselect(\u001b[39mrange\u001b[39m(\u001b[39m1000\u001b[39m))\n\u001b[0;32m----> 2\u001b[0m dd \u001b[39m=\u001b[39m small_ds_2\u001b[39m.\u001b[39;49mmap(merge_events)\n", - "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:580\u001b[0m, in \u001b[0;36mtransmit_tasks..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 578\u001b[0m \u001b[39mself\u001b[39m: \u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m 
\u001b[39m=\u001b[39m kwargs\u001b[39m.\u001b[39mpop(\u001b[39m\"\u001b[39m\u001b[39mself\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 579\u001b[0m \u001b[39m# apply actual function\u001b[39;00m\n\u001b[0;32m--> 580\u001b[0m out: Union[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mDatasetDict\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m func(\u001b[39mself\u001b[39;49m, \u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 581\u001b[0m datasets: List[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(out\u001b[39m.\u001b[39mvalues()) \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(out, \u001b[39mdict\u001b[39m) \u001b[39melse\u001b[39;00m [out]\n\u001b[1;32m 582\u001b[0m \u001b[39mfor\u001b[39;00m dataset \u001b[39min\u001b[39;00m datasets:\n\u001b[1;32m 583\u001b[0m \u001b[39m# Remove task templates if a column mapping of the template is no longer valid\u001b[39;00m\n", - "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:545\u001b[0m, in \u001b[0;36mtransmit_format..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 538\u001b[0m self_format \u001b[39m=\u001b[39m {\n\u001b[1;32m 539\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mtype\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_format_type,\n\u001b[1;32m 540\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mformat_kwargs\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_format_kwargs,\n\u001b[1;32m 541\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mcolumns\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_format_columns,\n\u001b[1;32m 542\u001b[0m \u001b[39m\"\u001b[39m\u001b[39moutput_all_columns\u001b[39m\u001b[39m\"\u001b[39m: 
\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_output_all_columns,\n\u001b[1;32m 543\u001b[0m }\n\u001b[1;32m 544\u001b[0m \u001b[39m# apply actual function\u001b[39;00m\n\u001b[0;32m--> 545\u001b[0m out: Union[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mDatasetDict\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m func(\u001b[39mself\u001b[39;49m, \u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 546\u001b[0m datasets: List[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(out\u001b[39m.\u001b[39mvalues()) \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(out, \u001b[39mdict\u001b[39m) \u001b[39melse\u001b[39;00m [out]\n\u001b[1;32m 547\u001b[0m \u001b[39m# re-apply format to the output\u001b[39;00m\n", - "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:3087\u001b[0m, in \u001b[0;36mDataset.map\u001b[0;34m(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)\u001b[0m\n\u001b[1;32m 3079\u001b[0m \u001b[39mif\u001b[39;00m transformed_dataset \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 3080\u001b[0m \u001b[39mwith\u001b[39;00m logging\u001b[39m.\u001b[39mtqdm(\n\u001b[1;32m 3081\u001b[0m disable\u001b[39m=\u001b[39m\u001b[39mnot\u001b[39;00m logging\u001b[39m.\u001b[39mis_progress_bar_enabled(),\n\u001b[1;32m 3082\u001b[0m unit\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m examples\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 3085\u001b[0m desc\u001b[39m=\u001b[39mdesc \u001b[39mor\u001b[39;00m 
\u001b[39m\"\u001b[39m\u001b[39mMap\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 3086\u001b[0m ) \u001b[39mas\u001b[39;00m pbar:\n\u001b[0;32m-> 3087\u001b[0m \u001b[39mfor\u001b[39;00m rank, done, content \u001b[39min\u001b[39;00m Dataset\u001b[39m.\u001b[39m_map_single(\u001b[39m*\u001b[39m\u001b[39m*\u001b[39mdataset_kwargs):\n\u001b[1;32m 3088\u001b[0m \u001b[39mif\u001b[39;00m done:\n\u001b[1;32m 3089\u001b[0m shards_done \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m1\u001b[39m\n", - "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:3441\u001b[0m, in \u001b[0;36mDataset._map_single\u001b[0;34m(shard, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, new_fingerprint, rank, offset)\u001b[0m\n\u001b[1;32m 3439\u001b[0m _time \u001b[39m=\u001b[39m time\u001b[39m.\u001b[39mtime()\n\u001b[1;32m 3440\u001b[0m \u001b[39mfor\u001b[39;00m i, example \u001b[39min\u001b[39;00m shard_iterable:\n\u001b[0;32m-> 3441\u001b[0m example \u001b[39m=\u001b[39m apply_function_on_filtered_inputs(example, i, offset\u001b[39m=\u001b[39;49moffset)\n\u001b[1;32m 3442\u001b[0m \u001b[39mif\u001b[39;00m update_data:\n\u001b[1;32m 3443\u001b[0m \u001b[39mif\u001b[39;00m i \u001b[39m==\u001b[39m \u001b[39m0\u001b[39m:\n", - "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:3344\u001b[0m, in \u001b[0;36mDataset._map_single..apply_function_on_filtered_inputs\u001b[0;34m(pa_inputs, indices, check_same_num_examples, offset)\u001b[0m\n\u001b[1;32m 3342\u001b[0m \u001b[39mif\u001b[39;00m with_rank:\n\u001b[1;32m 3343\u001b[0m additional_args \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m (rank,)\n\u001b[0;32m-> 3344\u001b[0m processed_inputs \u001b[39m=\u001b[39m function(\u001b[39m*\u001b[39;49mfn_args, \u001b[39m*\u001b[39;49madditional_args, 
\u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mfn_kwargs)\n\u001b[1;32m 3345\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(processed_inputs, LazyDict):\n\u001b[1;32m 3346\u001b[0m processed_inputs \u001b[39m=\u001b[39m {\n\u001b[1;32m 3347\u001b[0m k: v \u001b[39mfor\u001b[39;00m k, v \u001b[39min\u001b[39;00m processed_inputs\u001b[39m.\u001b[39mdata\u001b[39m.\u001b[39mitems() \u001b[39mif\u001b[39;00m k \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m processed_inputs\u001b[39m.\u001b[39mkeys_to_format\n\u001b[1;32m 3348\u001b[0m }\n", - "Cell \u001b[0;32mIn[410], line 173\u001b[0m, in \u001b[0;36mmerge_events\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 171\u001b[0m first_event \u001b[39m=\u001b[39m issues[\u001b[39m0\u001b[39m][\u001b[39m\"\u001b[39m\u001b[39mevents\u001b[39m\u001b[39m\"\u001b[39m][\u001b[39m0\u001b[39m]\n\u001b[1;32m 172\u001b[0m base_data[\u001b[39m'\u001b[39m\u001b[39mpull_request.title\u001b[39m\u001b[39m'\u001b[39m] \u001b[39m=\u001b[39m first_event[\u001b[39m\"\u001b[39m\u001b[39mtitle\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[0;32m--> 173\u001b[0m base_data[\u001b[39m\"\u001b[39m\u001b[39mrepo.name\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m base_data[\u001b[39m\"\u001b[39;49m\u001b[39mrepo\u001b[39;49m\u001b[39m\"\u001b[39;49m]\n\u001b[1;32m 174\u001b[0m base_data[\u001b[39m\"\u001b[39m\u001b[39morg.id\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m base_data[\u001b[39m\"\u001b[39m\u001b[39morg\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[1;32m 175\u001b[0m base_data[\u001b[39m\"\u001b[39m\u001b[39mrepo.name\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m base_data[\u001b[39m\"\u001b[39m\u001b[39mrepo\u001b[39m\u001b[39m\"\u001b[39m]\n", - "\u001b[0;31mKeyError\u001b[0m: 'repo'" - ] - } - ], - "source": [ - "small_ds_2 = ds.select(range(1000))\n", - "dd = small_ds_2.map(merge_events)" - ] - }, - { - "cell_type": "code", - "execution_count": 405, - "metadata": {}, - "outputs": [ - { - "data": { - 
"text/plain": [ - "dict_keys(['action', 'author', 'comment', 'comment_id', 'description', 'title', 'type', 'created_at'])" - ] - }, - "execution_count": 405, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "issues_events[0].keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 366, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'pull_request.guid': 'karen-kua/COVID-19_Tracker/pull/15',\n", - " 'pull_request.code_review_events': None,\n", - " 'pull_request.events': None,\n", - " 'pull_request.issue_events': '{\"repo\": \"karen-kua/COVID-19_Tracker\", \"org\": null, \"issue_id\": 1018615993, \"issue_number\": 15, \"pull_request\": {\"number\": 15.0, \"repo\": \"COVID-19_Tracker\", \"user_login\": \"karen-kua\"}, \"events\": [{\"action\": \"opened\", \"author\": \"dependabot[bot]\", \"comment\": null, \"comment_id\": null, \"datetime\": \"2021-10-06T15:46:43Z\", \"description\": \"Bumps [url-parse](https://github.com/unshiftio/url-parse) from 1.4.7 to 1.5.3.\\\\n
\\\\nCommits\\\\n
    \\\\n
  • ad44493 [dist] 1.5.3
  • \\\\n
  • c798461 [fix] Fix host parsing for file URLs (#210)
  • \\\\n
  • 201034b [dist] 1.5.2
  • \\\\n
  • 2d9ac2c [fix] Sanitize only special URLs (#209)
  • \\\\n
  • fb128af [fix] Use \\'null\\' as origin for non special URLs
  • \\\\n
  • fed6d9e [fix] Add a leading slash only if the URL is special
  • \\\\n
  • 94872e7 [fix] Do not incorrectly set the slashes property to true
  • \\\\n
  • 81ab967 [fix] Ignore slashes after the protocol for special URLs
  • \\\\n
  • ee22050 [ci] Use GitHub Actions
  • \\\\n
  • d2979b5 [fix] Special case the file: protocol (#204)
  • \\\\n
  • Additional commits viewable in compare view
  • \\\\n
\\\\n
\\\\n
\\\\n\\\\n\\\\n[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=url-parse&package-manager=npm_and_yarn&previous-version=1.4.7&new-version=1.5.3)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)\\\\n\\\\nDependabot will resolve any conflicts with this PR as long as you don\\'t alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.\\\\n\\\\n[//]: # (dependabot-automerge-start)\\\\n[//]: # (dependabot-automerge-end)\\\\n\\\\n---\\\\n\\\\n
\\\\nDependabot commands and options\\\\n
\\\\n\\\\nYou can trigger Dependabot actions by commenting on this PR:\\\\n- `@dependabot rebase` will rebase this PR\\\\n- `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it\\\\n- `@dependabot merge` will merge this PR after your CI passes on it\\\\n- `@dependabot squash and merge` will squash and merge this PR after your CI passes on it\\\\n- `@dependabot cancel merge` will cancel a previously requested merge and block automerging\\\\n- `@dependabot reopen` will reopen this PR if it is closed\\\\n- `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually\\\\n- `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)\\\\n- `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)\\\\n- `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)\\\\n- `@dependabot use these labels` will set the current labels as the default for future PRs for this repo and language\\\\n- `@dependabot use these reviewers` will set the current reviewers as the default for future PRs for this repo and language\\\\n- `@dependabot use these assignees` will set the current assignees as the default for future PRs for this repo and language\\\\n- `@dependabot use this milestone` will set the current milestone as the default for future PRs for this repo and language\\\\n\\\\nYou can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/azukimochi/COVID-19_Tracker/network/alerts).\\\\n\\\\n
\", \"title\": \"Bump url-parse from 1.4.7 to 1.5.3\", \"type\": \"issue\"}, {\"action\": \"created\", \"author\": \"dependabot[bot]\", \"comment\": \"Superseded by #17.\", \"comment_id\": 1045459471.0, \"datetime\": \"2022-02-19 00:53:17+00:00\", \"description\": null, \"title\": null, \"type\": \"comment\"}]}',\n", - " 'bucket': '940',\n", - " '__index_level_0__': 72946}" - ] - }, - "execution_count": 366, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "row" - ] - }, - { - "cell_type": "code", - "execution_count": 360, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['repo', 'org', 'issue_id', 'issue_number', 'pull_request', 'events'])" - ] - }, - "execution_count": 360, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "issues[0].keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 361, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'number': 15.0, 'repo': 'COVID-19_Tracker', 'user_login': 'karen-kua'}" - ] - }, - "execution_count": 361, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "issues[0][\"pull_request\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 351, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "small_ds_2 = ds.select(range(500))" - ] - }, - { - "cell_type": "code", - "execution_count": 398, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - }, - { - "ename": "KeyError", - "evalue": "'events'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[396], line 167\u001b[0m, in \u001b[0;36mmerge_events\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 166\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 167\u001b[0m base_data 
\u001b[39m=\u001b[39m events[\u001b[39m0\u001b[39m] \u001b[39mif\u001b[39;00m events \u001b[39melse\u001b[39;00m reviews[\u001b[39m0\u001b[39;49m]\n\u001b[1;32m 168\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mIndexError\u001b[39;00m:\n", - "\u001b[0;31mIndexError\u001b[0m: list index out of range", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[398], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m merged_ds \u001b[39m=\u001b[39m small_ds_2\u001b[39m.\u001b[39;49mmap(merge_events, remove_columns\u001b[39m=\u001b[39;49m[\u001b[39m\"\u001b[39;49m\u001b[39mpull_request.events\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m\"\u001b[39;49m\u001b[39mpull_request.code_review_events\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m\"\u001b[39;49m\u001b[39mpull_request.issue_events\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m'\u001b[39;49m\u001b[39m__index_level_0__\u001b[39;49m\u001b[39m'\u001b[39;49m,\u001b[39m'\u001b[39;49m\u001b[39mpull_request.guid\u001b[39;49m\u001b[39m'\u001b[39;49m])\n", - "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:580\u001b[0m, in \u001b[0;36mtransmit_tasks..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 578\u001b[0m \u001b[39mself\u001b[39m: \u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m \u001b[39m=\u001b[39m kwargs\u001b[39m.\u001b[39mpop(\u001b[39m\"\u001b[39m\u001b[39mself\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 579\u001b[0m \u001b[39m# apply actual function\u001b[39;00m\n\u001b[0;32m--> 580\u001b[0m out: Union[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mDatasetDict\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m func(\u001b[39mself\u001b[39;49m, \u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 
581\u001b[0m datasets: List[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(out\u001b[39m.\u001b[39mvalues()) \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(out, \u001b[39mdict\u001b[39m) \u001b[39melse\u001b[39;00m [out]\n\u001b[1;32m 582\u001b[0m \u001b[39mfor\u001b[39;00m dataset \u001b[39min\u001b[39;00m datasets:\n\u001b[1;32m 583\u001b[0m \u001b[39m# Remove task templates if a column mapping of the template is no longer valid\u001b[39;00m\n", - "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:545\u001b[0m, in \u001b[0;36mtransmit_format..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 538\u001b[0m self_format \u001b[39m=\u001b[39m {\n\u001b[1;32m 539\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mtype\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_format_type,\n\u001b[1;32m 540\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mformat_kwargs\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_format_kwargs,\n\u001b[1;32m 541\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mcolumns\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_format_columns,\n\u001b[1;32m 542\u001b[0m \u001b[39m\"\u001b[39m\u001b[39moutput_all_columns\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_output_all_columns,\n\u001b[1;32m 543\u001b[0m }\n\u001b[1;32m 544\u001b[0m \u001b[39m# apply actual function\u001b[39;00m\n\u001b[0;32m--> 545\u001b[0m out: Union[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mDatasetDict\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m func(\u001b[39mself\u001b[39;49m, \u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 546\u001b[0m datasets: 
List[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(out\u001b[39m.\u001b[39mvalues()) \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(out, \u001b[39mdict\u001b[39m) \u001b[39melse\u001b[39;00m [out]\n\u001b[1;32m 547\u001b[0m \u001b[39m# re-apply format to the output\u001b[39;00m\n", - "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:3087\u001b[0m, in \u001b[0;36mDataset.map\u001b[0;34m(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)\u001b[0m\n\u001b[1;32m 3079\u001b[0m \u001b[39mif\u001b[39;00m transformed_dataset \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 3080\u001b[0m \u001b[39mwith\u001b[39;00m logging\u001b[39m.\u001b[39mtqdm(\n\u001b[1;32m 3081\u001b[0m disable\u001b[39m=\u001b[39m\u001b[39mnot\u001b[39;00m logging\u001b[39m.\u001b[39mis_progress_bar_enabled(),\n\u001b[1;32m 3082\u001b[0m unit\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m examples\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 3085\u001b[0m desc\u001b[39m=\u001b[39mdesc \u001b[39mor\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mMap\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 3086\u001b[0m ) \u001b[39mas\u001b[39;00m pbar:\n\u001b[0;32m-> 3087\u001b[0m \u001b[39mfor\u001b[39;00m rank, done, content \u001b[39min\u001b[39;00m Dataset\u001b[39m.\u001b[39m_map_single(\u001b[39m*\u001b[39m\u001b[39m*\u001b[39mdataset_kwargs):\n\u001b[1;32m 3088\u001b[0m \u001b[39mif\u001b[39;00m done:\n\u001b[1;32m 3089\u001b[0m shards_done \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m1\u001b[39m\n", - "File 
\u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:3441\u001b[0m, in \u001b[0;36mDataset._map_single\u001b[0;34m(shard, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, new_fingerprint, rank, offset)\u001b[0m\n\u001b[1;32m 3439\u001b[0m _time \u001b[39m=\u001b[39m time\u001b[39m.\u001b[39mtime()\n\u001b[1;32m 3440\u001b[0m \u001b[39mfor\u001b[39;00m i, example \u001b[39min\u001b[39;00m shard_iterable:\n\u001b[0;32m-> 3441\u001b[0m example \u001b[39m=\u001b[39m apply_function_on_filtered_inputs(example, i, offset\u001b[39m=\u001b[39;49moffset)\n\u001b[1;32m 3442\u001b[0m \u001b[39mif\u001b[39;00m update_data:\n\u001b[1;32m 3443\u001b[0m \u001b[39mif\u001b[39;00m i \u001b[39m==\u001b[39m \u001b[39m0\u001b[39m:\n", - "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:3344\u001b[0m, in \u001b[0;36mDataset._map_single..apply_function_on_filtered_inputs\u001b[0;34m(pa_inputs, indices, check_same_num_examples, offset)\u001b[0m\n\u001b[1;32m 3342\u001b[0m \u001b[39mif\u001b[39;00m with_rank:\n\u001b[1;32m 3343\u001b[0m additional_args \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m (rank,)\n\u001b[0;32m-> 3344\u001b[0m processed_inputs \u001b[39m=\u001b[39m function(\u001b[39m*\u001b[39;49mfn_args, \u001b[39m*\u001b[39;49madditional_args, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mfn_kwargs)\n\u001b[1;32m 3345\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(processed_inputs, LazyDict):\n\u001b[1;32m 3346\u001b[0m processed_inputs \u001b[39m=\u001b[39m {\n\u001b[1;32m 3347\u001b[0m k: v \u001b[39mfor\u001b[39;00m k, v \u001b[39min\u001b[39;00m processed_inputs\u001b[39m.\u001b[39mdata\u001b[39m.\u001b[39mitems() \u001b[39mif\u001b[39;00m k \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m 
processed_inputs\u001b[39m.\u001b[39mkeys_to_format\n\u001b[1;32m 3348\u001b[0m }\n", - "Cell \u001b[0;32mIn[396], line 170\u001b[0m, in \u001b[0;36mmerge_events\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 168\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mIndexError\u001b[39;00m:\n\u001b[1;32m 169\u001b[0m base_data \u001b[39m=\u001b[39m issues_events[\u001b[39m0\u001b[39m]\n\u001b[0;32m--> 170\u001b[0m first_event \u001b[39m=\u001b[39m base_data[\u001b[39m\"\u001b[39;49m\u001b[39mevents\u001b[39;49m\u001b[39m\"\u001b[39;49m][\u001b[39m0\u001b[39m]\n\u001b[1;32m 171\u001b[0m base_data[\u001b[39m'\u001b[39m\u001b[39mpull_request.title\u001b[39m\u001b[39m'\u001b[39m] \u001b[39m=\u001b[39m first_event[\u001b[39m\"\u001b[39m\u001b[39mtitle\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[1;32m 172\u001b[0m base_data[\u001b[39m\"\u001b[39m\u001b[39mrepo.name\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m base_data[\u001b[39m\"\u001b[39m\u001b[39mrepo\u001b[39m\u001b[39m\"\u001b[39m]\n", - "\u001b[0;31mKeyError\u001b[0m: 'events'" - ] - } - ], - "source": [ - "merged_ds = small_ds_2.map(merge_events, remove_columns=[\"pull_request.events\", \"pull_request.code_review_events\", \"pull_request.issue_events\", '__index_level_0__','pull_request.guid'])" - ] - }, - { - "cell_type": "code", - "execution_count": 150, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Creating parquet from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 31.42ba/s]\n", - "Upload 1 LFS files: 100%|██████████| 1/1 [00:10<00:00, 10.30s/it]\n", - "Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:11<00:00, 11.45s/it]\n" - ] - } - ], - "source": [ - "merged_ds.push_to_hub(\"loubnabnl/code_reviews_3\")" - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading readme: 100%|██████████| 5.88k/5.88k [00:00<00:00, 3.76MB/s]\n" 
- ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Downloading and preparing dataset None/None to /Users/loubnabenallal/.cache/huggingface/datasets/loubnabnl___parquet/loubnabnl--clean_prs2-50c7cc07186d2bb2/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading data: 100%|██████████| 16.1M/16.1M [00:00<00:00, 17.4MB/s]\n", - "Downloading data files: 100%|██████████| 1/1 [00:02<00:00, 2.65s/it]\n", - "Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 676.50it/s]\n", - " \r" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dataset parquet downloaded and prepared to /Users/loubnabenallal/.cache/huggingface/datasets/loubnabnl___parquet/loubnabnl--clean_prs2-50c7cc07186d2bb2/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.\n" - ] - }, - { - "data": { - "text/plain": [ - "Dataset({\n", - " features: ['bucket', 'pull_request_info', 'head_repo_info', 'base_repo_info', 'events'],\n", - " num_rows: 10000\n", - "})" - ] - }, - "execution_count": 72, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ds = load_dataset(\"loubnabnl/clean_prs2\", split=\"train\")\n", - "ds" - ] - }, - { - "cell_type": "code", - "execution_count": 123, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'__index_level_0__': 1028,\n", - " 'bucket': None,\n", - " 'pull_request.code_review_events': None,\n", - " 'pull_request.events': '[{\"type\": \"PullRequestEvent\", \"action\": \"opened\", '\n", - " '\"actor.login\": \"M-Davies\", \"actor.id\": 25231953, '\n", - " '\"user.login\": null, \"user.id\": null, \"user.type\": '\n", - " 'null, \"repo.name\": \"AdoptOpenJDK/openjdk-build\", '\n", - " '\"repo.id\": 85294562, \"public\": true, \"created_at\": '\n", - " '\"2020-05-28T09:45:30Z\", \"org.id\": 
1673867, '\n", - " '\"org.login\": \"AdoptOpenJDK\", \"pull_request.id\": '\n", - " '424372800, \"pull_request.number\": 1787, '\n", - " '\"pull_request.state\": \"open\", \"pull_request.title\": '\n", - " '\"Revert \\'Fire installer failure on all failed '\n", - " 'results\\'\", \"pull_request.body\": \"* Seems to cause a '\n", - " 'lot of false positives or just doesnt work overall. '\n", - " 'Better to just remove for '\n", - " 'now\\\\r\\\\n\\\\r\\\\nSigned-off-by: Morgan Davies '\n", - " '\", \"pull_request.user.login\": '\n", - " '\"M-Davies\", \"pull_request.user.id\": 25231953, '\n", - " '\"pull_request.author_association\": \"CONTRIBUTOR\", '\n", - " '\"pull_request.created_at\": \"2020-05-28T09:45:30Z\", '\n", - " '\"pull_request.updated_at\": \"2020-05-28T09:45:30Z\", '\n", - " '\"pull_request.closed_at\": null, '\n", - " '\"pull_request.merged_at\": null, '\n", - " '\"pull_request.merge_commit_sha\": null, '\n", - " '\"pull_request.locked\": false, '\n", - " '\"pull_request.assignee.login\": null, '\n", - " '\"pull_request.assignee.id\": null, '\n", - " '\"pull_request.assignee.type\": null, '\n", - " '\"pull_request.assignee.site_admin\": null, '\n", - " '\"pull_request.milestone.id\": null, '\n", - " '\"pull_request.milestone.number\": null, '\n", - " '\"pull_request.milestone.title\": null, '\n", - " '\"pull_request.milestone.description\": null, '\n", - " '\"pull_request.milestone.creator.login\": null, '\n", - " '\"pull_request.milestone.creator.id\": null, '\n", - " '\"pull_request.milestone.creator.type\": null, '\n", - " '\"pull_request.milestone.creator.site_admin\": null, '\n", - " '\"pull_request.milestone.open_issues\": null, '\n", - " '\"pull_request.milestone.closed_issues\": null, '\n", - " '\"pull_request.milestone.state\": null, '\n", - " '\"pull_request.milestone.created_at\": null, '\n", - " '\"pull_request.milestone.updated_at\": null, '\n", - " '\"pull_request.milestone.due_on\": null, '\n", - " 
'\"pull_request.milestone.closed_at\": null, '\n", - " '\"pull_request.merged\": false, '\n", - " '\"pull_request.mergeable\": null, '\n", - " '\"pull_request.mergeable_state\": \"unknown\", '\n", - " '\"pull_request.merged_by.login\": null, '\n", - " '\"pull_request.merged_by.id\": null, '\n", - " '\"pull_request.merged_by.type\": null, '\n", - " '\"pull_request.merged_by.site_admin\": null, '\n", - " '\"pull_request.comments\": 0, '\n", - " '\"pull_request.review_comments\": 0, '\n", - " '\"pull_request.commits\": 1, \"pull_request.additions\": '\n", - " '4, \"pull_request.deletions\": 6, '\n", - " '\"pull_request.changed_files\": 1, '\n", - " '\"pull_request.label.id\": null, '\n", - " '\"pull_request.label.name\": null, '\n", - " '\"pull_request.label.color\": null, '\n", - " '\"pull_request.label.default\": null, '\n", - " '\"pull_request.head.label\": \"M-Davies:revert\", '\n", - " '\"pull_request.head.ref\": \"revert\", '\n", - " '\"pull_request.head.sha\": '\n", - " '\"023faba7db4130d746f68e6b4fb26170a3834254\", '\n", - " '\"pull_request.head.user.login\": \"M-Davies\", '\n", - " '\"pull_request.head.user.type\": \"User\", '\n", - " '\"pull_request.head.repo.name\": \"openjdk-build\", '\n", - " '\"pull_request.head.repo.full_name\": '\n", - " '\"M-Davies/openjdk-build\", '\n", - " '\"pull_request.head.repo.owner.login\": \"M-Davies\", '\n", - " '\"pull_request.head.repo.owner.type\": \"User\", '\n", - " '\"pull_request.head.repo.private\": false, '\n", - " '\"pull_request.head.repo.homepage\": \"\", '\n", - " '\"pull_request.head.repo.description\": \"AdoptOpenJDK '\n", - " 'community OpenJDK build scripts - common across all '\n", - " 'releases/versions\", \"pull_request.head.repo.fork\": '\n", - " 'true, \"pull_request.head.repo.created_at\": '\n", - " '\"2019-11-29T09:24:43Z\", '\n", - " '\"pull_request.head.repo.updated_at\": '\n", - " '\"2020-05-27T14:45:16Z\", '\n", - " '\"pull_request.head.repo.pushed_at\": '\n", - " '\"2020-05-27T14:45:13Z\", '\n", 
- " '\"pull_request.head.repo.size\": 2383, '\n", - " '\"pull_request.head.repo.stargazers_count\": 0, '\n", - " '\"pull_request.head.repo.watchers_count\": 0, '\n", - " '\"pull_request.head.repo.language\": \"Shell\", '\n", - " '\"pull_request.head.repo.has_issues\": false, '\n", - " '\"pull_request.head.repo.has_projects\": true, '\n", - " '\"pull_request.head.repo.has_downloads\": true, '\n", - " '\"pull_request.head.repo.has_wiki\": true, '\n", - " '\"pull_request.head.repo.has_pages\": false, '\n", - " '\"pull_request.head.repo.forks_count\": 0, '\n", - " '\"pull_request.head.repo.archived\": false, '\n", - " '\"pull_request.head.repo.disabled\": false, '\n", - " '\"pull_request.head.repo.open_issues_count\": 0, '\n", - " '\"pull_request.head.repo.forks\": 0, '\n", - " '\"pull_request.head.repo.open_issues\": 0, '\n", - " '\"pull_request.head.repo.watchers\": 0, '\n", - " '\"pull_request.head.repo.default_branch\": \"master\", '\n", - " '\"pull_request.head.repo.license.key\": \"apache-2.0\", '\n", - " '\"pull_request.head.repo.license.spdx_id\": '\n", - " '\"Apache-2.0\", \"pull_request.head.repo.license.name\": '\n", - " '\"Apache License 2.0\", \"pull_request.base.label\": '\n", - " '\"AdoptOpenJDK:master\", \"pull_request.base.ref\": '\n", - " '\"master\", \"pull_request.base.sha\": '\n", - " '\"32a19e7a01b4d50cc8c10f8f675a2aeb2ffeaefb\", '\n", - " '\"pull_request.base.user.login\": \"AdoptOpenJDK\", '\n", - " '\"pull_request.base.user.type\": \"Organization\", '\n", - " '\"pull_request.base.repo.name\": \"openjdk-build\", '\n", - " '\"pull_request.base.repo.full_name\": '\n", - " '\"AdoptOpenJDK/openjdk-build\", '\n", - " '\"pull_request.base.repo.owner.login\": \"AdoptOpenJDK\", '\n", - " '\"pull_request.base.repo.owner.type\": \"Organization\", '\n", - " '\"pull_request.base.repo.private\": false, '\n", - " '\"pull_request.base.repo.homepage\": \"\", '\n", - " '\"pull_request.base.repo.description\": \"AdoptOpenJDK '\n", - " 'community OpenJDK build 
scripts - common across all '\n", - " 'releases/versions\", \"pull_request.base.repo.fork\": '\n", - " 'false, \"pull_request.base.repo.created_at\": '\n", - " '\"2017-03-17T09:31:50Z\", '\n", - " '\"pull_request.base.repo.updated_at\": '\n", - " '\"2020-05-28T07:45:12Z\", '\n", - " '\"pull_request.base.repo.pushed_at\": '\n", - " '\"2020-05-27T14:18:11Z\", '\n", - " '\"pull_request.base.repo.size\": 2234, '\n", - " '\"pull_request.base.repo.stargazers_count\": 620, '\n", - " '\"pull_request.base.repo.watchers_count\": 620, '\n", - " '\"pull_request.base.repo.language\": \"Shell\", '\n", - " '\"pull_request.base.repo.has_issues\": true, '\n", - " '\"pull_request.base.repo.has_projects\": true, '\n", - " '\"pull_request.base.repo.has_downloads\": true, '\n", - " '\"pull_request.base.repo.has_wiki\": true, '\n", - " '\"pull_request.base.repo.has_pages\": false, '\n", - " '\"pull_request.base.repo.forks_count\": 137, '\n", - " '\"pull_request.base.repo.archived\": false, '\n", - " '\"pull_request.base.repo.disabled\": false, '\n", - " '\"pull_request.base.repo.open_issues_count\": 166, '\n", - " '\"pull_request.base.repo.forks\": 137, '\n", - " '\"pull_request.base.repo.open_issues\": 166, '\n", - " '\"pull_request.base.repo.watchers\": 620, '\n", - " '\"pull_request.base.repo.default_branch\": \"master\", '\n", - " '\"pull_request.base.repo.license.key\": \"apache-2.0\", '\n", - " '\"pull_request.base.repo.license.spdx_id\": '\n", - " '\"Apache-2.0\", \"pull_request.base.repo.license.name\": '\n", - " '\"Apache License 2.0\", \"pull_request.guid\": '\n", - " '\"AdoptOpenJDK/openjdk-build/pull/1787\"}, {\"type\": '\n", - " '\"PullRequestEvent\", \"action\": \"closed\", '\n", - " '\"actor.login\": \"sxa\", \"actor.id\": 6487691, '\n", - " '\"user.login\": null, \"user.id\": null, \"user.type\": '\n", - " 'null, \"repo.name\": \"AdoptOpenJDK/openjdk-build\", '\n", - " '\"repo.id\": 85294562, \"public\": true, \"created_at\": '\n", - " '\"2020-05-28T09:51:49Z\", 
\"org.id\": 1673867, '\n", - " '\"org.login\": \"AdoptOpenJDK\", \"pull_request.id\": '\n", - " '424372800, \"pull_request.number\": 1787, '\n", - " '\"pull_request.state\": \"closed\", \"pull_request.title\": '\n", - " '\"Revert \\'Fire installer failure on all failed '\n", - " 'results\\'\", \"pull_request.body\": \"* Seems to cause a '\n", - " 'lot of false positives or just doesnt work overall. '\n", - " 'Better to just remove for '\n", - " 'now\\\\r\\\\n\\\\r\\\\nSigned-off-by: Morgan Davies '\n", - " '\", \"pull_request.user.login\": '\n", - " '\"M-Davies\", \"pull_request.user.id\": 25231953, '\n", - " '\"pull_request.author_association\": \"CONTRIBUTOR\", '\n", - " '\"pull_request.created_at\": \"2020-05-28T09:45:30Z\", '\n", - " '\"pull_request.updated_at\": \"2020-05-28T09:51:48Z\", '\n", - " '\"pull_request.closed_at\": \"2020-05-28T09:51:48Z\", '\n", - " '\"pull_request.merged_at\": \"2020-05-28T09:51:48Z\", '\n", - " '\"pull_request.merge_commit_sha\": '\n", - " '\"4c3495c6f008459ca1c276477c5f968e9dcd7c6b\", '\n", - " '\"pull_request.locked\": false, '\n", - " '\"pull_request.assignee.login\": null, '\n", - " '\"pull_request.assignee.id\": null, '\n", - " '\"pull_request.assignee.type\": null, '\n", - " '\"pull_request.assignee.site_admin\": null, '\n", - " '\"pull_request.milestone.id\": null, '\n", - " '\"pull_request.milestone.number\": null, '\n", - " '\"pull_request.milestone.title\": null, '\n", - " '\"pull_request.milestone.description\": null, '\n", - " '\"pull_request.milestone.creator.login\": null, '\n", - " '\"pull_request.milestone.creator.id\": null, '\n", - " '\"pull_request.milestone.creator.type\": null, '\n", - " '\"pull_request.milestone.creator.site_admin\": null, '\n", - " '\"pull_request.milestone.open_issues\": null, '\n", - " '\"pull_request.milestone.closed_issues\": null, '\n", - " '\"pull_request.milestone.state\": null, '\n", - " '\"pull_request.milestone.created_at\": null, '\n", - " '\"pull_request.milestone.updated_at\": 
null, '\n", - " '\"pull_request.milestone.due_on\": null, '\n", - " '\"pull_request.milestone.closed_at\": null, '\n", - " '\"pull_request.merged\": true, '\n", - " '\"pull_request.mergeable\": null, '\n", - " '\"pull_request.mergeable_state\": \"unknown\", '\n", - " '\"pull_request.merged_by.login\": \"sxa\", '\n", - " '\"pull_request.merged_by.id\": 6487691, '\n", - " '\"pull_request.merged_by.type\": \"User\", '\n", - " '\"pull_request.merged_by.site_admin\": false, '\n", - " '\"pull_request.comments\": 0, '\n", - " '\"pull_request.review_comments\": 0, '\n", - " '\"pull_request.commits\": 1, \"pull_request.additions\": '\n", - " '4, \"pull_request.deletions\": 6, '\n", - " '\"pull_request.changed_files\": 1, '\n", - " '\"pull_request.label.id\": null, '\n", - " '\"pull_request.label.name\": null, '\n", - " '\"pull_request.label.color\": null, '\n", - " '\"pull_request.label.default\": null, '\n", - " '\"pull_request.head.label\": \"M-Davies:revert\", '\n", - " '\"pull_request.head.ref\": \"revert\", '\n", - " '\"pull_request.head.sha\": '\n", - " '\"023faba7db4130d746f68e6b4fb26170a3834254\", '\n", - " '\"pull_request.head.user.login\": \"M-Davies\", '\n", - " '\"pull_request.head.user.type\": \"User\", '\n", - " '\"pull_request.head.repo.name\": \"openjdk-build\", '\n", - " '\"pull_request.head.repo.full_name\": '\n", - " '\"M-Davies/openjdk-build\", '\n", - " '\"pull_request.head.repo.owner.login\": \"M-Davies\", '\n", - " '\"pull_request.head.repo.owner.type\": \"User\", '\n", - " '\"pull_request.head.repo.private\": false, '\n", - " '\"pull_request.head.repo.homepage\": \"\", '\n", - " '\"pull_request.head.repo.description\": \"AdoptOpenJDK '\n", - " 'community OpenJDK build scripts - common across all '\n", - " 'releases/versions\", \"pull_request.head.repo.fork\": '\n", - " 'true, \"pull_request.head.repo.created_at\": '\n", - " '\"2019-11-29T09:24:43Z\", '\n", - " '\"pull_request.head.repo.updated_at\": '\n", - " '\"2020-05-27T14:45:16Z\", '\n", - " 
'\"pull_request.head.repo.pushed_at\": '\n", - " '\"2020-05-28T09:46:04Z\", '\n", - " '\"pull_request.head.repo.size\": 2383, '\n", - " '\"pull_request.head.repo.stargazers_count\": 0, '\n", - " '\"pull_request.head.repo.watchers_count\": 0, '\n", - " '\"pull_request.head.repo.language\": \"Shell\", '\n", - " '\"pull_request.head.repo.has_issues\": false, '\n", - " '\"pull_request.head.repo.has_projects\": true, '\n", - " '\"pull_request.head.repo.has_downloads\": true, '\n", - " '\"pull_request.head.repo.has_wiki\": true, '\n", - " '\"pull_request.head.repo.has_pages\": false, '\n", - " '\"pull_request.head.repo.forks_count\": 0, '\n", - " '\"pull_request.head.repo.archived\": false, '\n", - " '\"pull_request.head.repo.disabled\": false, '\n", - " '\"pull_request.head.repo.open_issues_count\": 0, '\n", - " '\"pull_request.head.repo.forks\": 0, '\n", - " '\"pull_request.head.repo.open_issues\": 0, '\n", - " '\"pull_request.head.repo.watchers\": 0, '\n", - " '\"pull_request.head.repo.default_branch\": \"master\", '\n", - " '\"pull_request.head.repo.license.key\": \"apache-2.0\", '\n", - " '\"pull_request.head.repo.license.spdx_id\": '\n", - " '\"Apache-2.0\", \"pull_request.head.repo.license.name\": '\n", - " '\"Apache License 2.0\", \"pull_request.base.label\": '\n", - " '\"AdoptOpenJDK:master\", \"pull_request.base.ref\": '\n", - " '\"master\", \"pull_request.base.sha\": '\n", - " '\"32a19e7a01b4d50cc8c10f8f675a2aeb2ffeaefb\", '\n", - " '\"pull_request.base.user.login\": \"AdoptOpenJDK\", '\n", - " '\"pull_request.base.user.type\": \"Organization\", '\n", - " '\"pull_request.base.repo.name\": \"openjdk-build\", '\n", - " '\"pull_request.base.repo.full_name\": '\n", - " '\"AdoptOpenJDK/openjdk-build\", '\n", - " '\"pull_request.base.repo.owner.login\": \"AdoptOpenJDK\", '\n", - " '\"pull_request.base.repo.owner.type\": \"Organization\", '\n", - " '\"pull_request.base.repo.private\": false, '\n", - " '\"pull_request.base.repo.homepage\": \"\", '\n", - " 
'\"pull_request.base.repo.description\": \"AdoptOpenJDK '\n", - " 'community OpenJDK build scripts - common across all '\n", - " 'releases/versions\", \"pull_request.base.repo.fork\": '\n", - " 'false, \"pull_request.base.repo.created_at\": '\n", - " '\"2017-03-17T09:31:50Z\", '\n", - " '\"pull_request.base.repo.updated_at\": '\n", - " '\"2020-05-28T07:45:12Z\", '\n", - " '\"pull_request.base.repo.pushed_at\": '\n", - " '\"2020-05-28T09:51:48Z\", '\n", - " '\"pull_request.base.repo.size\": 2234, '\n", - " '\"pull_request.base.repo.stargazers_count\": 620, '\n", - " '\"pull_request.base.repo.watchers_count\": 620, '\n", - " '\"pull_request.base.repo.language\": \"Shell\", '\n", - " '\"pull_request.base.repo.has_issues\": true, '\n", - " '\"pull_request.base.repo.has_projects\": true, '\n", - " '\"pull_request.base.repo.has_downloads\": true, '\n", - " '\"pull_request.base.repo.has_wiki\": true, '\n", - " '\"pull_request.base.repo.has_pages\": false, '\n", - " '\"pull_request.base.repo.forks_count\": 137, '\n", - " '\"pull_request.base.repo.archived\": false, '\n", - " '\"pull_request.base.repo.disabled\": false, '\n", - " '\"pull_request.base.repo.open_issues_count\": 165, '\n", - " '\"pull_request.base.repo.forks\": 137, '\n", - " '\"pull_request.base.repo.open_issues\": 165, '\n", - " '\"pull_request.base.repo.watchers\": 620, '\n", - " '\"pull_request.base.repo.default_branch\": \"master\", '\n", - " '\"pull_request.base.repo.license.key\": \"apache-2.0\", '\n", - " '\"pull_request.base.repo.license.spdx_id\": '\n", - " '\"Apache-2.0\", \"pull_request.base.repo.license.name\": '\n", - " '\"Apache License 2.0\", \"pull_request.guid\": '\n", - " '\"AdoptOpenJDK/openjdk-build/pull/1787\"}]',\n", - " 'pull_request.guid': 'AdoptOpenJDK/openjdk-build/pull/1787',\n", - " 'pull_request.issue_events': None}\n" - ] - } - ], - "source": [ - "from pprint import pprint\n", - "pprint(small_ds[50])" - ] - }, - { - "cell_type": "code", - "execution_count": 151, - "metadata": {}, 
- "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[{'action': 'opened',\n", - " 'actor.id': 25231953,\n", - " 'actor.login': 'M-Davies',\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': datetime.datetime(2020, 5, 28, 9, 45, 30, tzinfo=),\n", - " 'issue.author': None,\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': None,\n", - " 'pull_request.merged': False,\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.state': 'open',\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'PullRequestEvent',\n", - " 'user.login': None,\n", - " 'user.type': None},\n", - " {'action': 'closed',\n", - " 'actor.id': 6487691,\n", - " 'actor.login': 'sxa',\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 
'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': datetime.datetime(2020, 5, 28, 9, 51, 49, tzinfo=),\n", - " 'issue.author': None,\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': None,\n", - " 'pull_request.merged': True,\n", - " 'pull_request.merged_by.login': 'sxa',\n", - " 'pull_request.merged_by.type': 'User',\n", - " 'pull_request.state': 'closed',\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'PullRequestEvent',\n", - " 'user.login': None,\n", - " 'user.type': None}]\n" - ] - } - ], - "source": [ - "pprint(merged_ds[50][\"events\"])" + "merged_ds = ds.map(merge_events, remove_columns=[\"pull_request.events\", \"pull_request.code_review_events\", \"pull_request.issue_events\", '__index_level_0__','pull_request.guid'])" ] }, { "cell_type": "code", - "execution_count": 222, + "execution_count": 449, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Found cached dataset parquet (/Users/loubnabenallal/.cache/huggingface/datasets/loubnabnl___parquet/loubnabnl--code_reviews_3-c3e4ac735edf14b4/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)\n" + "Creating parquet from Arrow format: 100%|██████████| 84/84 [00:02<00:00, 37.20ba/s]\n", + "Upload 1 LFS files: 100%|██████████| 1/1 [00:22<00:00, 22.10s/it]\n", + "Creating parquet from Arrow format: 100%|██████████| 84/84 [00:02<00:00, 33.12ba/s]s/it]\n", + "Upload 1 LFS files: 100%|██████████| 1/1 [00:09<00:00, 9.55s/it]\n", + "Creating parquet from Arrow format: 100%|██████████| 84/84 [00:02<00:00, 39.47ba/s]s/it]\n", + "Upload 1 LFS files: 100%|██████████| 1/1 [00:09<00:00, 9.99s/it]\n", + "Creating parquet from Arrow 
format: 100%|██████████| 84/84 [00:02<00:00, 37.45ba/s]s/it]\n", + "Upload 1 LFS files: 100%|██████████| 1/1 [00:23<00:00, 23.74s/it]\n", + "Creating parquet from Arrow format: 100%|██████████| 84/84 [00:02<00:00, 34.84ba/s]s/it]\n", + "Upload 1 LFS files: 100%|██████████| 1/1 [00:22<00:00, 22.48s/it]\n", + "Creating parquet from Arrow format: 100%|██████████| 84/84 [00:03<00:00, 26.04ba/s]s/it]\n", + "Upload 1 LFS files: 100%|██████████| 1/1 [00:22<00:00, 22.62s/it]\n", + "Pushing dataset shards to the dataset hub: 100%|██████████| 6/6 [02:10<00:00, 21.69s/it]\n" ] } ], "source": [ - "ds = load_dataset(\"loubnabnl/code_reviews_3\", split=\"train\")\n", - "size = len(ds)" - ] - }, - { - "cell_type": "code", - "execution_count": 223, - "metadata": {}, - "outputs": [], - "source": [ - "sample = ds[1470]\n", - "events = sample[\"events\"]\n", - "grouped_events = create_grouped_events(events)\n", - "original_poster = sample[\"pull_request_info\"]['pull_request.user.login']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pprint import pprint\n", - "\n", - "pprint(small_ds[50])" - ] - }, - { - "cell_type": "code", - "execution_count": 224, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "📝 **Title**: Fix @inheritDocs behavior
\n", - " 📦 **GitHub Repo**: Azure/azure-sdk-for-java, PR Number: 26816, ID: 836647691.
\n", - " Link: [https://github.com/Azure/azure-sdk-for-java/pull/26816](https://github.com/Azure/azure-sdk-for-java/pull/26816)" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
AttributeDetail
🧾 PR Typeissue
🟢 PR Stateopen
👤 PR Authorkasobol-msft
🏷️ Head Branchref: kasobol-msft-patch-1, label: Azure:kasobol-msft-patch-1
🌳 Base Branchmain
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Make sure that dependency sources are included in javadoc generation.\r\n", - "\r\n", - "Fixes https://github.com/Azure/azure-sdk-for-java/issues/26814" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "def get_pr_info(sample):\n", - " pr_info = sample[\"pull_request_info\"]\n", - " head_info = sample[\"head_repo_info\"]\n", - " base_info = sample[\"base_repo_info\"]\n", - " events = sample[\"events\"]\n", - "\n", - " gh_link = f\"https://github.com/{pr_info['repo.name']}/pull/{pr_info['pull_request.number']}\"\n", - "\n", - " header = f\"\"\"📝 **Title**: {pr_info['pull_request.title']}
\n", - " 📦 **GitHub Repo**: {pr_info['repo.name']}, PR Number: {pr_info['pull_request.number']}, ID: {pr_info['pull_request.id']}.
\n", - " Link: [{gh_link}]({gh_link})\"\"\"\n", - " pr_info_html = f\"\"\"\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
AttributeDetail
🧾 PR Type{events[0]['type']}
🟢 PR State{pr_info['pull_request.state']}
👤 PR Author{pr_info['pull_request.user.login']}
🏷️ Head Branchref: {head_info['pull_request.head.ref']}, label: {head_info['pull_request.head.label']}
🌳 Base Branch{base_info['pull_request.base.ref']}
\n", - " \"\"\"\n", - " return header, pr_info_html, pr_info['pull_request.body']\n", - "\n", - "from IPython.display import HTML, display\n", - "display(HTML(get_pr_info(sample)[0]))\n", - "display(HTML(get_pr_info(sample)[1]))\n", - "display(HTML(get_pr_info(sample)[2]))" - ] - }, - { - "cell_type": "code", - "execution_count": 308, - "metadata": {}, - "outputs": [], - "source": [ - "sample = ds[4]\n", - "events = sample[\"events\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 309, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2" - ] - }, - "execution_count": 309, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(events)" - ] - }, - { - "cell_type": "code", - "execution_count": 310, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[{'action': 'opened',\n", - " 'actor.id': 39814207,\n", - " 'actor.login': 'pull[bot]',\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': datetime.datetime(2022, 10, 10, 10, 57, 41, tzinfo=),\n", - " 'issue.author': None,\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': None,\n", - " 'pull_request.merged': False,\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.state': 'open',\n", - " 'review.author_association': None,\n", - " 
'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'PullRequestEvent',\n", - " 'user.login': None,\n", - " 'user.type': None},\n", - " {'action': 'closed',\n", - " 'actor.id': 39814207,\n", - " 'actor.login': 'pull[bot]',\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': datetime.datetime(2022, 10, 10, 11, 1, 28, tzinfo=),\n", - " 'issue.author': None,\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': None,\n", - " 'pull_request.merged': True,\n", - " 'pull_request.merged_by.login': 'pull[bot]',\n", - " 'pull_request.merged_by.type': 'Bot',\n", - " 'pull_request.state': 'closed',\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'PullRequestEvent',\n", - " 'user.login': None,\n", - " 'user.type': None}]\n" - ] - } - ], - "source": [ - "pprint(events)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "import uuid\n", - "\n", - "def create_grouped_events(events):\n", - " df = pd.DataFrame(events)\n", - " # Ensure it's in datetime format\n", - " df['created_at'] = pd.to_datetime(df['created_at'])\n", - 
" # Create a new column 'uuid' initialized with None\n", - " df['uuid'] = None\n", - " # For rows where either 'comment.diff_hunk' or 'comment.commit_id' is NaN, assign a unique UUID\n", - " mask = df['comment.diff_hunk'].isna() | df['comment.commit_id'].isna()\n", - " df.loc[mask, 'uuid'] = [str(uuid.uuid4()) for _ in range(mask.sum())]\n", - " # Group by 'comment.diff_hunk', 'comment.commit_id', and 'uuid'\n", - " grouped_events = [group.drop(columns='uuid').to_dict(orient='records') for _, group in df.groupby(['comment.diff_hunk', 'comment.commit_id', 'uuid'], dropna=False)]\n", - " return grouped_events\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 229, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "846\n" - ] - } - ], - "source": [ - "for i in range(len(ds)):\n", - " e = ds[i]\n", - " if e[\"events\"][0][\"comment.diff_hunk\"]:\n", - " print(i)\n", - " break" - ] - }, - { - "cell_type": "code", - "execution_count": 299, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[{'action': 'opened',\n", - " 'actor.id': 1753262,\n", - " 'actor.login': 'mo9a7i',\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': datetime.datetime(2022, 5, 5, 4, 35, 2, tzinfo=),\n", - " 'issue.author': None,\n", - " 'issue.comment': None,\n", - " 
'issue.comment_id': None,\n", - " 'pull_request.merged': False,\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.state': 'open',\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'PullRequestEvent',\n", - " 'user.login': None,\n", - " 'user.type': None},\n", - " {'action': 'created',\n", - " 'actor.id': 1753262,\n", - " 'actor.login': 'mo9a7i',\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': datetime.datetime(2022, 5, 5, 4, 35, 2, tzinfo=),\n", - " 'issue.author': None,\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': None,\n", - " 'pull_request.merged': None,\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.state': 'open',\n", - " 'review.author_association': 'MEMBER',\n", - " 'review.body': 'looks fine',\n", - " 'review.commit_id': 'ba75444d1ada77cf5f3f06cd74b6320bab8db54b',\n", - " 'review.id': 962846794,\n", - " 'review.state': 'commented',\n", - " 'review.submitted_at': '2022-05-05T04:35:02Z',\n", - " 'type': 'PullRequestReviewEvent',\n", - " 'user.login': 'mo9a7i',\n", - " 'user.type': 'User'},\n", - " {'action': 'closed',\n", - " 'actor.id': 
1753262,\n", - " 'actor.login': 'mo9a7i',\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': datetime.datetime(2022, 5, 5, 4, 35, 3, tzinfo=),\n", - " 'issue.author': None,\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': None,\n", - " 'pull_request.merged': True,\n", - " 'pull_request.merged_by.login': 'mo9a7i',\n", - " 'pull_request.merged_by.type': 'User',\n", - " 'pull_request.state': 'closed',\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'PullRequestEvent',\n", - " 'user.login': None,\n", - " 'user.type': None}]\n" - ] - } - ], - "source": [ - "pprint(events)" - ] - }, - { - "cell_type": "code", - "execution_count": 303, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
actionactor.idactor.logincomment.author_associationcomment.bodycomment.commit_idcomment.created_atcomment.diff_hunkcomment.idcomment.in_reply_to_id...review.author_associationreview.bodyreview.commit_idreview.idreview.statereview.submitted_attypeuser.loginuser.typegroup_key
0opened1753262mo9a7iNoneNoneNoneNoneNoneNoneNone...NoneNoneNoneNaNNoneNonePullRequestEventNoneNone1.0
1created1753262mo9a7iNoneNoneNoneNoneNoneNoneNone...MEMBERlooks fineba75444d1ada77cf5f3f06cd74b6320bab8db54b962846794.0commented2022-05-05T04:35:02ZPullRequestReviewEventmo9a7iUserba75444d1ada77cf5f3f06cd74b6320bab8db54b
2closed1753262mo9a7iNoneNoneNoneNoneNoneNoneNone...NoneNoneNoneNaNNoneNonePullRequestEventNoneNone2.0
\n", - "

3 rows × 39 columns

\n", - "
" - ], - "text/plain": [ - " action actor.id actor.login comment.author_association comment.body \n", - "0 opened 1753262 mo9a7i None None \\\n", - "1 created 1753262 mo9a7i None None \n", - "2 closed 1753262 mo9a7i None None \n", - "\n", - " comment.commit_id comment.created_at comment.diff_hunk comment.id \n", - "0 None None None None \\\n", - "1 None None None None \n", - "2 None None None None \n", - "\n", - " comment.in_reply_to_id ... review.author_association review.body \n", - "0 None ... None None \\\n", - "1 None ... MEMBER looks fine \n", - "2 None ... None None \n", - "\n", - " review.commit_id review.id review.state \n", - "0 None NaN None \\\n", - "1 ba75444d1ada77cf5f3f06cd74b6320bab8db54b 962846794.0 commented \n", - "2 None NaN None \n", - "\n", - " review.submitted_at type user.login user.type \n", - "0 None PullRequestEvent None None \\\n", - "1 2022-05-05T04:35:02Z PullRequestReviewEvent mo9a7i User \n", - "2 None PullRequestEvent None None \n", - "\n", - " group_key \n", - "0 1.0 \n", - "1 ba75444d1ada77cf5f3f06cd74b6320bab8db54b \n", - "2 2.0 \n", - "\n", - "[3 rows x 39 columns]" - ] - }, - "execution_count": 303, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import numpy as np\n", - "df = pd.DataFrame(events)\n", - "df['created_at'] = pd.to_datetime(df['created_at'])\n", - "df.drop_duplicates(inplace=True)\n", - "# Create a new 'group_key' column. For non-null 'review.commit_id' values, it's the same value.\n", - "mask = df['review.commit_id'].isnull()\n", - "df.loc[mask, 'group_key'] = np.arange(mask.sum()) + 1\n", - "df.loc[~mask, 'group_key'] = df['review.commit_id']\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 304, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "df = pd.DataFrame(events)\n", - "df['created_at'] = pd.to_datetime(df['created_at'])\n", - "df.drop_duplicates(inplace=True)\n", - "# Create a new 'group_key' column. 
For non-null 'review.commit_id' values, it's the same value.\n", - "mask = df['review.commit_id'].isnull()\n", - "df.loc[mask, 'group_key'] = np.arange(mask.sum()) + 1\n", - "df.loc[~mask, 'group_key'] = df['review.commit_id']\n", - "\n", - "if len(df) == 1:\n", - " grouped_events = [[df.iloc[0].to_dict()]]\n", - "else:\n", - " grouped_events = [group.to_dict(orient='records') for _, group in df.groupby('group_key', dropna=False)]\n", - "\n", - "# sort by first event date\n", - "grouped_events = sorted(grouped_events, key=lambda x: x[0]['created_at'])\n" - ] - }, - { - "cell_type": "code", - "execution_count": 311, - "metadata": {}, - "outputs": [], - "source": [ - "def create_grouped_events(events):\n", - " \"\"\"group events that happened in the same review thread using review.commit_id\"\"\"\n", - " df = pd.DataFrame(events)\n", - " df['created_at'] = pd.to_datetime(df['created_at'])\n", - " df.drop_duplicates(inplace=True)\n", - " # Create a new 'group_key' where rows with NaN 'review.commit_id' get an identical identifier. 
Otherwise NaN values go in the same group\n", - " mask = df['review.commit_id'].isnull()\n", - " df.loc[mask, 'group_key'] = np.arange(mask.sum()) + 1\n", - " df.loc[~mask, 'group_key'] = df['review.commit_id']\n", - " \n", - " if len(df) == 1:\n", - " grouped_events = [[df.iloc[0].to_dict()]]\n", - " else:\n", - " grouped_events = [group.to_dict(orient='records') for _, group in df.groupby('group_key', dropna=False)]\n", - " \n", - " # sort by first event date\n", - " grouped_events = sorted(grouped_events, key=lambda x: x[0]['created_at'])\n", - " return grouped_events\n", - "\n", - "grouped_events = create_grouped_events(events)" - ] - }, - { - "cell_type": "code", - "execution_count": 312, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "len events 2 and len grouped_events 2\n" - ] - } - ], - "source": [ - "print(f\"len events {len(events)} and len grouped_events {len(grouped_events)}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 313, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "thread number 0\n", - "thread number 1\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \"\"\n", - " \n", - " \n", - "
Event TypePullRequestEvent
Userpull[bot]
Actionopened
Review StateNone
PR Stateopen, merged: False
Date2022-10-10 10:57:41+00:00
\n", - "
\n", - " \n", - "---------------------------------------------------------------------------------------------------------------------------------------------------------------------
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \"\"\n", - " \n", - " \n", - "
Event TypePullRequestEvent
Userpull[bot]
Actionclosed
Review StateNone
PR Stateclosed, merged: True
Date2022-10-10 11:01:28+00:00
\n", - "
\n", - " \n", - "---------------------------------------------------------------------------------------------------------------------------------------------------------------------
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "\n", - "original_poster = sample[\"pull_request_info\"]['pull_request.user.login']\n", - "thread_html = \"\"\n", - "c = 0\n", - "for thread in grouped_events:\n", - " print(f\"thread number {c}\")\n", - " c += 1\n", - " thread_html += '
'\n", - " # Get the first event in the thread as a reference\n", - " first_event = thread[0]\n", - " poster_name = first_event['actor.login'] or first_event['issue.author'] or first_event['user.login']\n", - " # Add shared parts of the events only once\n", - " user_type = f\"(type :{first_event['user.type']})\" if first_event['user.type'] else \"\"\n", - " review_state = f\"Review State{first_event['review.state']}\" if first_event['review.state'] else \"\"\n", - " text = f\"\"\"\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " {review_state}\n", - " \n", - " \n", - "
Event Type{first_event['type']}
User{poster_name} {user_type}
Action{first_event['action']}
PR State{first_event[\"pull_request.state\"]}, merged: {first_event['pull_request.merged']}
Date{first_event['created_at']}
\n", - "
\n", - " \"\"\"\n", - " highlight_action = \"background-color: #FFCFCF;\" if first_event['action'] == 'closed' else \"\"\n", - " highlight_pr_state = \"background-color: #FFCFCF;\" if first_event['pull_request.merged'] else \"\"\n", - "\n", - " text = f\"\"\"\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \"\"\n", - " \n", - " \n", - "
Event Type{first_event['type']}
User{poster_name} {user_type}
Action{first_event['action']}
Review State{first_event['review.state']}
PR State{first_event[\"pull_request.state\"]}, merged: {first_event['pull_request.merged']}
Date{first_event['created_at']}
\n", - "
\n", - " \"\"\"\n", - "\n", - "\n", - " thread_html += text\n", - " thread_html += (\"\\n\" + \"-\"*165)\n", - " # Add the bodies of the comments for each event in the thread\n", - " for event in thread:\n", - " # from 'actor.login' and 'issue.author' and 'user.login' take which ever isn't none\n", - " poster_name = event['actor.login'] or event['issue.author'] or event['user.login']\n", - " if event['comment.body'] or event[\"issue.comment\"]:\n", - " is_op = original_poster == poster_name\n", - " thread_html += format_body(event['comment.body'], poster_name, is_op)\n", - "\n", - " thread_html += '
'\n", - "\n", - "display(HTML(thread_html))" - ] - }, - { - "cell_type": "code", - "execution_count": 314, - "metadata": {}, - "outputs": [], - "source": [ - "def display_events(sample):\n", - " events = sample[\"events\"]\n", - " grouped_events = create_grouped_events(events)\n", - " original_poster = sample[\"pull_request_info\"]['pull_request.user.login']\n", - " for thread in grouped_events:\n", - " thread_html = '
'\n", - " # Get the first event in the thread as a reference\n", - " first_event = thread[0]\n", - " poster_name = first_event['actor.login'] or first_event['issue.author'] or first_event['user.login']\n", - " # Add shared parts of the events only once\n", - " user_type = f\"(type :{first_event['user.type']})\" if first_event['user.type'] else \"\"\n", - " highlight_action = \"background-color: #FFCFCF;\" if first_event['action'] == 'closed' else \"\"\n", - " highlight_pr_state = \"background-color: #FFCFCF;\" if first_event['pull_request.merged'] else \"\"\n", - " \n", - " text = f\"\"\"\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Event Type{first_event['type']}
User{poster_name} {user_type}
Action{first_event['action']}
Review State{first_event['review.state']}
PR State{first_event[\"pull_request.state\"]}, merged: {first_event['pull_request.merged']}
Date{first_event['created_at']}
\n", - "
\n", - " \"\"\"\n", - " print(f\"added first event of teh group\")\n", - " thread_html += text\n", - " \n", - " # Add the bodies of the comments for each event in the thread\n", - " for event in thread:\n", - " # from 'actor.login' and 'issue.author' and 'user.login' take which ever isn't none\n", - " poster_name = event['actor.login'] or event['issue.author'] or event['user.login']\n", - " if event['comment.body'] or event[\"issue.comment\"]:\n", - " is_op = original_poster == poster_name\n", - " thread_html += format_body(event['comment.body'], poster_name, is_op)\n", - "\n", - " thread_html += '
'\n", - " display(HTML(thread_html))\n", - " if first_event['comment.path']:\n", - " path_html = f\"Path: {first_event['comment.path']}\"\n", - " display(HTML(path_html))\n", - " display(HTML(\"---\"))" - ] - }, - { - "cell_type": "code", - "execution_count": 316, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[[{'action': 'opened',\n", - " 'actor.id': 39814207,\n", - " 'actor.login': 'pull[bot]',\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': Timestamp('2022-10-10 10:57:41+0000', tz='UTC'),\n", - " 'issue.author': None,\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': None,\n", - " 'pull_request.merged': False,\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.state': 'open',\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'PullRequestEvent',\n", - " 'user.login': None,\n", - " 'user.type': None,\n", - " 'group_key': 1.0}],\n", - " [{'action': 'closed',\n", - " 'actor.id': 39814207,\n", - " 'actor.login': 'pull[bot]',\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': 
None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': Timestamp('2022-10-10 11:01:28+0000', tz='UTC'),\n", - " 'issue.author': None,\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': None,\n", - " 'pull_request.merged': True,\n", - " 'pull_request.merged_by.login': 'pull[bot]',\n", - " 'pull_request.merged_by.type': 'Bot',\n", - " 'pull_request.state': 'closed',\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'PullRequestEvent',\n", - " 'user.login': None,\n", - " 'user.type': None,\n", - " 'group_key': 2.0}]]" - ] - }, - "execution_count": 316, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "grouped_events" - ] - }, - { - "cell_type": "code", - "execution_count": 315, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Event TypePullRequestEvent
Userpull[bot]
Actionopened
Review StateNone
PR Stateopen, merged: False
Date2022-10-10 10:57:41+00:00
\n", - "
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "---" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Event TypePullRequestEvent
Userpull[bot]
Actionopened
Review StateNone
PR Stateopen, merged: False
Date2022-10-10 10:57:41+00:00
\n", - "
\n", - "
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Event TypePullRequestEvent
Userpull[bot]
Actionclosed
Review StateNone
PR Stateclosed, merged: True
Date2022-10-10 11:01:28+00:00
\n", - "
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "---" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display_events(sample)" - ] - }, - { - "cell_type": "code", - "execution_count": 261, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "single\n", - "2022-05-05 04:35:02+00:00\n", - "with review state: commented\n", - "None\n", - "User: mo9a7i, action: created\n", - "PullRequestReviewEvent\n", - "------------\n", - "multiple\n", - "single\n", - "2022-05-05 04:35:02+00:00\n", - "with review state: None\n", - "None\n", - "User: mo9a7i, action: opened\n", - "PullRequestEvent\n", - "------------\n", - "------------\n", - "2022-05-05 04:35:02+00:00\n", - "with review state: None PR state False\n", - "None\n", - "User: mo9a7i, action: closed\n", - "PullRequestEvent\n", - "------------\n", - "------------end multiple\n" - ] - } - ], - "source": [ - "for group in grouped_events:\n", - " if len(group) == 1:\n", - " poster_name = group[0]['actor.login'] or group[0]['issue.author'] or group[0]['user.login']\n", - " print(\"single\")\n", - " print(group[0][\"created_at\"])\n", - " print(f\"with review state: {group[0]['review.state']}\")\n", - " print(group[0][\"comment.body\"])\n", - " # print action type and user\n", - " print(f\"User: {poster_name}, action: {group[0]['action']}\")\n", - " print(group[0][\"type\"])\n", - " print(\"------------\")\n", - " continue\n", - " # date \n", - " else:\n", - " print(\"multiple\")\n", - " poster_name = group[0]['actor.login'] or group[0]['issue.author'] or group[0]['user.login']\n", - " print(\"single\")\n", - " print(group[0][\"created_at\"])\n", - " print(f\"with review state: {group[0]['review.state']}\")\n", - " print(group[0][\"comment.body\"])\n", - " # print action type and user\n", - " print(f\"User: {poster_name}, action: 
{group[0]['action']}\")\n", - " print(group[0][\"type\"])\n", - " print(\"------------\")\n", - " print(\"------------\")\n", - " for e in group[1:]:\n", - " print(group[0][\"created_at\"])\n", - " print(f\"with review state: {group[0]['review.state']} PR state {group[0]['pull_request.merged']}\")\n", - " print(e[\"comment.body\"])\n", - " poster_name = e['actor.login'] or e['issue.author'] or e['user.login']\n", - " print(f\"User: {poster_name}, action: {e['action']}\")\n", - " print(e[\"type\"])\n", - " print(\"------------\")\n", - " print(\"------------end multiple\")" - ] - }, - { - "cell_type": "code", - "execution_count": 225, - "metadata": {}, - "outputs": [], - "source": [ - "def create_grouped_events(events):\n", - " df = pd.DataFrame(events)\n", - " df['created_at'] = pd.to_datetime(df['created_at'])\n", - " df = df.sort_values(['comment.diff_hunk', 'comment.commit_id', 'created_at'])\n", - " # Group events in a the same thread using 'comment.diff_hunk' and 'comment.commit_id'\n", - " if len(df) == 1:\n", - " grouped_events = [[df.iloc[0].to_dict()]]\n", - " else:\n", - " grouped_events = [group.to_dict(orient='records') for _, group in df.groupby(['comment.diff_hunk', 'comment.commit_id', 'pull_request.state'], dropna=False)]\n", - " return grouped_events\n", - "\n", - "def format_body(text, user, is_op=False):\n", - " color = \"#007bff\" if is_op else \"black\"\n", - " pr_body = f\"
👤{user}: {text}
\"\n", - " return pr_body" - ] - }, - { - "cell_type": "code", - "execution_count": 220, - "metadata": {}, - "outputs": [], - "source": [ - "import uuid\n", - "import pandas as pd\n", - "\n", - "def create_grouped_events(events):\n", - " df = pd.DataFrame(events)\n", - " \n", - " # Ensure it's in datetime format\n", - " df['created_at'] = pd.to_datetime(df['created_at'])\n", - " # Preserve the original order\n", - " df['order'] = range(len(df))\n", - "\n", - " # Create a new column 'uuid' initialized with None\n", - " df['uuid'] = None\n", - "\n", - " # For rows where either 'comment.diff_hunk' or 'comment.commit_id' is NaN, assign a unique UUID\n", - " mask = df['comment.diff_hunk'].isna() | df['comment.commit_id'].isna()\n", - " df.loc[mask, 'uuid'] = [str(uuid.uuid4()) for _ in range(mask.sum())]\n", - "\n", - " # Group by 'comment.diff_hunk', 'comment.commit_id', and 'uuid'\n", - " grouped_events = [group.drop(columns=['uuid', 'order']).to_dict(orient='records') \n", - " for _, group in df.sort_values(by='order').groupby(['comment.diff_hunk', 'comment.commit_id', 'uuid'], dropna=False)]\n", - " # soert on created_at\n", - " grouped_events = [sorted(group, key=lambda x: x['created_at']) for group in grouped_events]\n", - " return grouped_events\n", - "\n", - "\n", - "\n", - "grouped_events = create_grouped_events(events)\n", - "c = 0\n", - "thread_html = \"\"\n", - "for thread in grouped_events:\n", - " # Start a new thread\n", - " #print(thread)\n", - " if thread[0][\"action\"] == \"opened\":\n", - " continue\n", - " thread_html += '
'\n", - " # Get the first event in the thread as a reference\n", - " first_event = thread[0]\n", - " poster_name = first_event['actor.login'] or first_event['issue.author'] or first_event['user.login']\n", - " # Add shared parts of the events only once\n", - " text = f\"\"\"\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Event Type{first_event['type']}
User{poster_name} (type :{first_event['user.type']})
Action{first_event['action']}
Review State{first_event['review.state']}
PR State{first_event[\"pull_request.state\"]}, merged: {first_event['pull_request.merged']}
From Head{sample[\"head_repo_info\"]['pull_request.head.label']}
\n", - "
\n", - " \"\"\"\n", - " thread_html += text\n", - " # add horizontal line\n", - " thread_html += '
'\n", - " for event in thread:\n", - " # from 'actor.login' and 'issue.author' and 'user.login' take which ever isn't none\n", - " poster_name = event['actor.login'] or event['issue.author'] or event['user.login']\n", - " if event['comment.body'] or event[\"issue.comment\"]:\n", - " is_op = original_poster == poster_name\n", - " thread_html += format_body(event['comment.body'], poster_name, is_op)\n", - "\n", - " thread_html += '
'" - ] - }, - { - "cell_type": "code", - "execution_count": 218, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "4" - ] - }, - "execution_count": 218, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(grouped_events)" - ] - }, - { - "cell_type": "code", - "execution_count": 221, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[{'action': 'created',\n", - " 'actor.id': nan,\n", - " 'actor.login': None,\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': Timestamp('2022-02-01 00:05:19+0000', tz='UTC'),\n", - " 'issue.author': 'kasobol-msft',\n", - " 'issue.comment': \"This won't work well because it includes dependencies in \"\n", - " 'output like this:\\r\\n'\n", - " '![image](https://user-images.githubusercontent.com/61715331/151893024-ef3e99d9-0d83-44c6-839b-966550320642.png)\\r\\n'\n", - " '\\r\\n'\n", - " \"There's hacky way to side step this:\\r\\n\"\n", - " '![image](https://user-images.githubusercontent.com/61715331/151893056-8d018cb9-2f0d-4c7d-8848-eb9df9028b88.png)\\r\\n'\n", - " '\\r\\n'\n", - " 'But it would require be explicit about each dependency in '\n", - " 'each sdk to be precise and not risk any \"dependency doc '\n", - " 'leaks\".',\n", - " 'issue.comment_id': 1026335328.0,\n", - " 'pull_request.merged': None,\n", - " 
'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.state': None,\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'comment',\n", - " 'user.login': None,\n", - " 'user.type': None}],\n", - " [{'action': 'opened',\n", - " 'actor.id': 61715331.0,\n", - " 'actor.login': 'kasobol-msft',\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': Timestamp('2022-01-31 22:51:21+0000', tz='UTC'),\n", - " 'issue.author': None,\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': nan,\n", - " 'pull_request.merged': False,\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.state': 'open',\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'PullRequestEvent',\n", - " 'user.login': None,\n", - " 'user.type': None}],\n", - " [{'action': 'opened',\n", - " 'actor.id': nan,\n", - " 'actor.login': None,\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 
'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': Timestamp('2022-01-31 22:51:20+0000', tz='UTC'),\n", - " 'issue.author': 'kasobol-msft',\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': nan,\n", - " 'pull_request.merged': None,\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.state': None,\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'issue',\n", - " 'user.login': None,\n", - " 'user.type': None}],\n", - " [{'action': 'closed',\n", - " 'actor.id': 61715331.0,\n", - " 'actor.login': 'kasobol-msft',\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': Timestamp('2022-02-01 00:05:20+0000', tz='UTC'),\n", - " 
'issue.author': None,\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': nan,\n", - " 'pull_request.merged': False,\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.state': 'closed',\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'PullRequestEvent',\n", - " 'user.login': None,\n", - " 'user.type': None}]]\n" - ] - } - ], - "source": [ - "pprint(grouped_events)" - ] - }, - { - "cell_type": "code", - "execution_count": 193, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Event TypePullRequestEvent
Userkasobol-msft (type :None)
Actionclosed
Review StateNone
PR Stateclosed, merged: False
From HeadAzure:kasobol-msft-patch-1
\n", - "
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from IPython.display import HTML, display\n", - "display(HTML(thread_html))" - ] - }, - { - "cell_type": "code", - "execution_count": 92, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'pull_request.base.label': 'AbdElrahmanMuhammedNasr:master',\n", - " 'pull_request.base.ref': 'master',\n", - " 'pull_request.base.repo.default_branch': 'master',\n", - " 'pull_request.base.repo.description': None,\n", - " 'pull_request.base.repo.forks_count': 0,\n", - " 'pull_request.base.repo.homepage': None,\n", - " 'pull_request.base.repo.language': 'TypeScript',\n", - " 'pull_request.base.repo.license.name': None,\n", - " 'pull_request.base.repo.name': 'WuzuufMasr',\n", - " 'pull_request.base.repo.open_issues_count': 24,\n", - " 'pull_request.base.repo.owner.login': 'AbdElrahmanMuhammedNasr',\n", - " 'pull_request.base.repo.owner.type': 'User',\n", - " 'pull_request.base.repo.private': False,\n", - " 'pull_request.base.repo.stargazers_count': 0,\n", - " 'pull_request.base.repo.watchers_count': 0,\n", - " 'pull_request.base.sha': 'a7d0127c02152dca69c41f83afb1a0a4d0c0e004',\n", - " 'pull_request.base.user.login': 'AbdElrahmanMuhammedNasr',\n", - " 'pull_request.base.user.type': 'User',\n", - " 'pull_request.comments': 0,\n", - " 'pull_request.label.name': None,\n", - " 'pull_request.review_comments': 0}" - ] - }, - "execution_count": 92, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "merged_ds[0][\"base_repo_info\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 80, - "metadata": {}, - "outputs": [], - "source": [ - "ds = merged_ds" - ] - }, - { - "cell_type": "code", - "execution_count": 321, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'__index_level_0__': 175,\n", - " 'bucket': '940',\n", - " 'pull_request.code_review_events': None,\n", - " 
'pull_request.events': '[{\"type\": \"PullRequestEvent\", \"action\": \"opened\", '\n", - " '\"actor.login\": \"pkarman\", \"actor.id\": 1205061, '\n", - " '\"user.login\": null, \"user.id\": null, \"user.type\": '\n", - " 'null, \"repo.name\": \"18F/C2\", \"repo.id\": 18201810, '\n", - " '\"public\": true, \"created_at\": \"2015-11-23T19:16:36Z\", '\n", - " '\"org.id\": 6233994, \"org.login\": \"18F\", '\n", - " '\"pull_request.id\": 51566831, \"pull_request.number\": '\n", - " '820, \"pull_request.state\": \"open\", '\n", - " '\"pull_request.title\": \"rename elk services to '\n", - " 'workaround blue-green deploy bug\", '\n", - " '\"pull_request.body\": \"there\\'s a bug in the '\n", - " 'cf-blue-green deploy that gets a false positive match '\n", - " 'based on the current ELK naming convention. I have '\n", - " 're-named all our ELK services to workaround that '\n", - " 'bug.\", \"pull_request.user.login\": \"pkarman\", '\n", - " '\"pull_request.user.id\": 1205061, '\n", - " '\"pull_request.author_association\": null, '\n", - " '\"pull_request.created_at\": \"2015-11-23T19:16:34Z\", '\n", - " '\"pull_request.updated_at\": \"2015-11-23T19:16:34Z\", '\n", - " '\"pull_request.closed_at\": null, '\n", - " '\"pull_request.merged_at\": null, '\n", - " '\"pull_request.merge_commit_sha\": '\n", - " '\"4b1557970247cde19eb3ea3992c324174d49a3d7\", '\n", - " '\"pull_request.locked\": false, '\n", - " '\"pull_request.assignee.login\": null, '\n", - " '\"pull_request.assignee.id\": null, '\n", - " '\"pull_request.assignee.type\": null, '\n", - " '\"pull_request.assignee.site_admin\": null, '\n", - " '\"pull_request.milestone.id\": null, '\n", - " '\"pull_request.milestone.number\": null, '\n", - " '\"pull_request.milestone.title\": null, '\n", - " '\"pull_request.milestone.description\": null, '\n", - " '\"pull_request.milestone.creator.login\": null, '\n", - " '\"pull_request.milestone.creator.id\": null, '\n", - " '\"pull_request.milestone.creator.type\": null, '\n", - " 
'\"pull_request.milestone.creator.site_admin\": null, '\n", - " '\"pull_request.milestone.open_issues\": null, '\n", - " '\"pull_request.milestone.closed_issues\": null, '\n", - " '\"pull_request.milestone.state\": null, '\n", - " '\"pull_request.milestone.created_at\": null, '\n", - " '\"pull_request.milestone.updated_at\": null, '\n", - " '\"pull_request.milestone.due_on\": null, '\n", - " '\"pull_request.milestone.closed_at\": null, '\n", - " '\"pull_request.merged\": false, '\n", - " '\"pull_request.mergeable\": true, '\n", - " '\"pull_request.mergeable_state\": \"clean\", '\n", - " '\"pull_request.merged_by.login\": null, '\n", - " '\"pull_request.merged_by.id\": null, '\n", - " '\"pull_request.merged_by.type\": null, '\n", - " '\"pull_request.merged_by.site_admin\": null, '\n", - " '\"pull_request.comments\": 0, '\n", - " '\"pull_request.review_comments\": 0, '\n", - " '\"pull_request.commits\": 1, \"pull_request.additions\": '\n", - " '3, \"pull_request.deletions\": 3, '\n", - " '\"pull_request.changed_files\": 1, '\n", - " '\"pull_request.label.id\": null, '\n", - " '\"pull_request.label.name\": null, '\n", - " '\"pull_request.label.color\": null, '\n", - " '\"pull_request.label.default\": null, '\n", - " '\"pull_request.head.label\": \"18F:elk-rename\", '\n", - " '\"pull_request.head.ref\": \"elk-rename\", '\n", - " '\"pull_request.head.sha\": '\n", - " '\"8a8321be4e8eff669e3d3406393b875bf56684c3\", '\n", - " '\"pull_request.head.user.login\": \"18F\", '\n", - " '\"pull_request.head.user.type\": \"Organization\", '\n", - " '\"pull_request.head.repo.name\": \"C2\", '\n", - " '\"pull_request.head.repo.full_name\": \"18F/C2\", '\n", - " '\"pull_request.head.repo.owner.login\": \"18F\", '\n", - " '\"pull_request.head.repo.owner.type\": \"Organization\", '\n", - " '\"pull_request.head.repo.private\": false, '\n", - " '\"pull_request.head.repo.homepage\": '\n", - " '\"https://cap.18f.gov\", '\n", - " '\"pull_request.head.repo.description\": \"an approval '\n", - 
" 'process automation tool\", '\n", - " '\"pull_request.head.repo.fork\": false, '\n", - " '\"pull_request.head.repo.created_at\": '\n", - " '\"2014-03-28T05:15:23Z\", '\n", - " '\"pull_request.head.repo.updated_at\": '\n", - " '\"2015-11-06T02:16:44Z\", '\n", - " '\"pull_request.head.repo.pushed_at\": '\n", - " '\"2015-11-23T19:16:35Z\", '\n", - " '\"pull_request.head.repo.size\": 81432, '\n", - " '\"pull_request.head.repo.stargazers_count\": 31, '\n", - " '\"pull_request.head.repo.watchers_count\": 31, '\n", - " '\"pull_request.head.repo.language\": \"Ruby\", '\n", - " '\"pull_request.head.repo.has_issues\": true, '\n", - " '\"pull_request.head.repo.has_projects\": null, '\n", - " '\"pull_request.head.repo.has_downloads\": true, '\n", - " '\"pull_request.head.repo.has_wiki\": false, '\n", - " '\"pull_request.head.repo.has_pages\": false, '\n", - " '\"pull_request.head.repo.forks_count\": 16, '\n", - " '\"pull_request.head.repo.archived\": null, '\n", - " '\"pull_request.head.repo.disabled\": null, '\n", - " '\"pull_request.head.repo.open_issues_count\": 6, '\n", - " '\"pull_request.head.repo.forks\": 16, '\n", - " '\"pull_request.head.repo.open_issues\": 6, '\n", - " '\"pull_request.head.repo.watchers\": 31, '\n", - " '\"pull_request.head.repo.default_branch\": \"master\", '\n", - " '\"pull_request.head.repo.license.key\": null, '\n", - " '\"pull_request.head.repo.license.spdx_id\": null, '\n", - " '\"pull_request.head.repo.license.name\": null, '\n", - " '\"pull_request.base.label\": \"18F:master\", '\n", - " '\"pull_request.base.ref\": \"master\", '\n", - " '\"pull_request.base.sha\": '\n", - " '\"5dc2669048311777bf472e824c1a6f865eaccc67\", '\n", - " '\"pull_request.base.user.login\": \"18F\", '\n", - " '\"pull_request.base.user.type\": \"Organization\", '\n", - " '\"pull_request.base.repo.name\": \"C2\", '\n", - " '\"pull_request.base.repo.full_name\": \"18F/C2\", '\n", - " '\"pull_request.base.repo.owner.login\": \"18F\", '\n", - " 
'\"pull_request.base.repo.owner.type\": \"Organization\", '\n", - " '\"pull_request.base.repo.private\": false, '\n", - " '\"pull_request.base.repo.homepage\": '\n", - " '\"https://cap.18f.gov\", '\n", - " '\"pull_request.base.repo.description\": \"an approval '\n", - " 'process automation tool\", '\n", - " '\"pull_request.base.repo.fork\": false, '\n", - " '\"pull_request.base.repo.created_at\": '\n", - " '\"2014-03-28T05:15:23Z\", '\n", - " '\"pull_request.base.repo.updated_at\": '\n", - " '\"2015-11-06T02:16:44Z\", '\n", - " '\"pull_request.base.repo.pushed_at\": '\n", - " '\"2015-11-23T19:16:35Z\", '\n", - " '\"pull_request.base.repo.size\": 81432, '\n", - " '\"pull_request.base.repo.stargazers_count\": 31, '\n", - " '\"pull_request.base.repo.watchers_count\": 31, '\n", - " '\"pull_request.base.repo.language\": \"Ruby\", '\n", - " '\"pull_request.base.repo.has_issues\": true, '\n", - " '\"pull_request.base.repo.has_projects\": null, '\n", - " '\"pull_request.base.repo.has_downloads\": true, '\n", - " '\"pull_request.base.repo.has_wiki\": false, '\n", - " '\"pull_request.base.repo.has_pages\": false, '\n", - " '\"pull_request.base.repo.forks_count\": 16, '\n", - " '\"pull_request.base.repo.archived\": null, '\n", - " '\"pull_request.base.repo.disabled\": null, '\n", - " '\"pull_request.base.repo.open_issues_count\": 6, '\n", - " '\"pull_request.base.repo.forks\": 16, '\n", - " '\"pull_request.base.repo.open_issues\": 6, '\n", - " '\"pull_request.base.repo.watchers\": 31, '\n", - " '\"pull_request.base.repo.default_branch\": \"master\", '\n", - " '\"pull_request.base.repo.license.key\": null, '\n", - " '\"pull_request.base.repo.license.spdx_id\": null, '\n", - " '\"pull_request.base.repo.license.name\": null, '\n", - " '\"pull_request.guid\": \"18F/C2/pull/820\"}, {\"type\": '\n", - " '\"PullRequestEvent\", \"action\": \"closed\", '\n", - " '\"actor.login\": \"jessieay\", \"actor.id\": 601515, '\n", - " '\"user.login\": null, \"user.id\": null, \"user.type\": 
'\n", - " 'null, \"repo.name\": \"18F/C2\", \"repo.id\": 18201810, '\n", - " '\"public\": true, \"created_at\": \"2015-11-23T22:09:46Z\", '\n", - " '\"org.id\": 6233994, \"org.login\": \"18F\", '\n", - " '\"pull_request.id\": 51566831, \"pull_request.number\": '\n", - " '820, \"pull_request.state\": \"closed\", '\n", - " '\"pull_request.title\": \"rename elk services to '\n", - " 'workaround blue-green deploy bug\", '\n", - " '\"pull_request.body\": \"there\\'s a bug in the '\n", - " 'cf-blue-green deploy that gets a false positive match '\n", - " 'based on the current ELK naming convention. I have '\n", - " 're-named all our ELK services to workaround that '\n", - " 'bug.\", \"pull_request.user.login\": \"pkarman\", '\n", - " '\"pull_request.user.id\": 1205061, '\n", - " '\"pull_request.author_association\": null, '\n", - " '\"pull_request.created_at\": \"2015-11-23T19:16:34Z\", '\n", - " '\"pull_request.updated_at\": \"2015-11-23T22:09:45Z\", '\n", - " '\"pull_request.closed_at\": \"2015-11-23T22:09:45Z\", '\n", - " '\"pull_request.merged_at\": \"2015-11-23T22:09:45Z\", '\n", - " '\"pull_request.merge_commit_sha\": '\n", - " '\"6d3c30d429a49321552973b81e1ef4cd3073157f\", '\n", - " '\"pull_request.locked\": false, '\n", - " '\"pull_request.assignee.login\": null, '\n", - " '\"pull_request.assignee.id\": null, '\n", - " '\"pull_request.assignee.type\": null, '\n", - " '\"pull_request.assignee.site_admin\": null, '\n", - " '\"pull_request.milestone.id\": null, '\n", - " '\"pull_request.milestone.number\": null, '\n", - " '\"pull_request.milestone.title\": null, '\n", - " '\"pull_request.milestone.description\": null, '\n", - " '\"pull_request.milestone.creator.login\": null, '\n", - " '\"pull_request.milestone.creator.id\": null, '\n", - " '\"pull_request.milestone.creator.type\": null, '\n", - " '\"pull_request.milestone.creator.site_admin\": null, '\n", - " '\"pull_request.milestone.open_issues\": null, '\n", - " '\"pull_request.milestone.closed_issues\": null, 
'\n", - " '\"pull_request.milestone.state\": null, '\n", - " '\"pull_request.milestone.created_at\": null, '\n", - " '\"pull_request.milestone.updated_at\": null, '\n", - " '\"pull_request.milestone.due_on\": null, '\n", - " '\"pull_request.milestone.closed_at\": null, '\n", - " '\"pull_request.merged\": true, '\n", - " '\"pull_request.mergeable\": null, '\n", - " '\"pull_request.mergeable_state\": \"unknown\", '\n", - " '\"pull_request.merged_by.login\": \"jessieay\", '\n", - " '\"pull_request.merged_by.id\": 601515, '\n", - " '\"pull_request.merged_by.type\": \"User\", '\n", - " '\"pull_request.merged_by.site_admin\": false, '\n", - " '\"pull_request.comments\": 1, '\n", - " '\"pull_request.review_comments\": 0, '\n", - " '\"pull_request.commits\": 1, \"pull_request.additions\": '\n", - " '3, \"pull_request.deletions\": 3, '\n", - " '\"pull_request.changed_files\": 1, '\n", - " '\"pull_request.label.id\": null, '\n", - " '\"pull_request.label.name\": null, '\n", - " '\"pull_request.label.color\": null, '\n", - " '\"pull_request.label.default\": null, '\n", - " '\"pull_request.head.label\": \"18F:elk-rename\", '\n", - " '\"pull_request.head.ref\": \"elk-rename\", '\n", - " '\"pull_request.head.sha\": '\n", - " '\"8a8321be4e8eff669e3d3406393b875bf56684c3\", '\n", - " '\"pull_request.head.user.login\": \"18F\", '\n", - " '\"pull_request.head.user.type\": \"Organization\", '\n", - " '\"pull_request.head.repo.name\": \"C2\", '\n", - " '\"pull_request.head.repo.full_name\": \"18F/C2\", '\n", - " '\"pull_request.head.repo.owner.login\": \"18F\", '\n", - " '\"pull_request.head.repo.owner.type\": \"Organization\", '\n", - " '\"pull_request.head.repo.private\": false, '\n", - " '\"pull_request.head.repo.homepage\": '\n", - " '\"https://cap.18f.gov\", '\n", - " '\"pull_request.head.repo.description\": \"an approval '\n", - " 'process automation tool\", '\n", - " '\"pull_request.head.repo.fork\": false, '\n", - " '\"pull_request.head.repo.created_at\": '\n", - " 
'\"2014-03-28T05:15:23Z\", '\n", - " '\"pull_request.head.repo.updated_at\": '\n", - " '\"2015-11-06T02:16:44Z\", '\n", - " '\"pull_request.head.repo.pushed_at\": '\n", - " '\"2015-11-23T22:09:45Z\", '\n", - " '\"pull_request.head.repo.size\": 81440, '\n", - " '\"pull_request.head.repo.stargazers_count\": 31, '\n", - " '\"pull_request.head.repo.watchers_count\": 31, '\n", - " '\"pull_request.head.repo.language\": \"Ruby\", '\n", - " '\"pull_request.head.repo.has_issues\": true, '\n", - " '\"pull_request.head.repo.has_projects\": null, '\n", - " '\"pull_request.head.repo.has_downloads\": true, '\n", - " '\"pull_request.head.repo.has_wiki\": false, '\n", - " '\"pull_request.head.repo.has_pages\": false, '\n", - " '\"pull_request.head.repo.forks_count\": 16, '\n", - " '\"pull_request.head.repo.archived\": null, '\n", - " '\"pull_request.head.repo.disabled\": null, '\n", - " '\"pull_request.head.repo.open_issues_count\": 4, '\n", - " '\"pull_request.head.repo.forks\": 16, '\n", - " '\"pull_request.head.repo.open_issues\": 4, '\n", - " '\"pull_request.head.repo.watchers\": 31, '\n", - " '\"pull_request.head.repo.default_branch\": \"master\", '\n", - " '\"pull_request.head.repo.license.key\": null, '\n", - " '\"pull_request.head.repo.license.spdx_id\": null, '\n", - " '\"pull_request.head.repo.license.name\": null, '\n", - " '\"pull_request.base.label\": \"18F:master\", '\n", - " '\"pull_request.base.ref\": \"master\", '\n", - " '\"pull_request.base.sha\": '\n", - " '\"5dc2669048311777bf472e824c1a6f865eaccc67\", '\n", - " '\"pull_request.base.user.login\": \"18F\", '\n", - " '\"pull_request.base.user.type\": \"Organization\", '\n", - " '\"pull_request.base.repo.name\": \"C2\", '\n", - " '\"pull_request.base.repo.full_name\": \"18F/C2\", '\n", - " '\"pull_request.base.repo.owner.login\": \"18F\", '\n", - " '\"pull_request.base.repo.owner.type\": \"Organization\", '\n", - " '\"pull_request.base.repo.private\": false, '\n", - " '\"pull_request.base.repo.homepage\": '\n", - 
" '\"https://cap.18f.gov\", '\n", - " '\"pull_request.base.repo.description\": \"an approval '\n", - " 'process automation tool\", '\n", - " '\"pull_request.base.repo.fork\": false, '\n", - " '\"pull_request.base.repo.created_at\": '\n", - " '\"2014-03-28T05:15:23Z\", '\n", - " '\"pull_request.base.repo.updated_at\": '\n", - " '\"2015-11-06T02:16:44Z\", '\n", - " '\"pull_request.base.repo.pushed_at\": '\n", - " '\"2015-11-23T22:09:45Z\", '\n", - " '\"pull_request.base.repo.size\": 81440, '\n", - " '\"pull_request.base.repo.stargazers_count\": 31, '\n", - " '\"pull_request.base.repo.watchers_count\": 31, '\n", - " '\"pull_request.base.repo.language\": \"Ruby\", '\n", - " '\"pull_request.base.repo.has_issues\": true, '\n", - " '\"pull_request.base.repo.has_projects\": null, '\n", - " '\"pull_request.base.repo.has_downloads\": true, '\n", - " '\"pull_request.base.repo.has_wiki\": false, '\n", - " '\"pull_request.base.repo.has_pages\": false, '\n", - " '\"pull_request.base.repo.forks_count\": 16, '\n", - " '\"pull_request.base.repo.archived\": null, '\n", - " '\"pull_request.base.repo.disabled\": null, '\n", - " '\"pull_request.base.repo.open_issues_count\": 4, '\n", - " '\"pull_request.base.repo.forks\": 16, '\n", - " '\"pull_request.base.repo.open_issues\": 4, '\n", - " '\"pull_request.base.repo.watchers\": 31, '\n", - " '\"pull_request.base.repo.default_branch\": \"master\", '\n", - " '\"pull_request.base.repo.license.key\": null, '\n", - " '\"pull_request.base.repo.license.spdx_id\": null, '\n", - " '\"pull_request.base.repo.license.name\": null, '\n", - " '\"pull_request.guid\": \"18F/C2/pull/820\"}]',\n", - " 'pull_request.guid': '18F/C2/pull/820',\n", - " 'pull_request.issue_events': '{\"repo\": \"18F/C2\", \"org\": \"18F\", \"issue_id\": '\n", - " '118451607, \"issue_number\": 820, \"pull_request\": '\n", - " '{\"number\": 820.0, \"repo\": \"C2\", \"user_login\": '\n", - " '\"18F\"}, \"events\": [{\"action\": \"opened\", '\n", - " '\"author\": \"pkarman\", 
\"comment\": null, '\n", - " '\"comment_id\": null, \"datetime\": '\n", - " '\"2015-11-23T19:16:34Z\", \"description\": '\n", - " '\"there\\'s a bug in the cf-blue-green deploy '\n", - " 'that gets a false positive match based on the '\n", - " 'current ELK naming convention. I have re-named '\n", - " 'all our ELK services to workaround that bug.\", '\n", - " '\"title\": \"rename elk services to workaround '\n", - " 'blue-green deploy bug\", \"type\": \"issue\"}, '\n", - " '{\"action\": \"created\", \"author\": \"jessieay\", '\n", - " '\"comment\": \"wish there were a good way to write '\n", - " 'tests for this type of thing...\\\\r\\\\n\\\\r\\\\nbut '\n", - " 'LGTM. merging. \", \"comment_id\": 159082113.0, '\n", - " '\"datetime\": \"2015-11-23 22:09:43+00:00\", '\n", - " '\"description\": null, \"title\": null, \"type\": '\n", - " '\"comment\"}]}'}\n" - ] - } - ], - "source": [ - "pprint(small_ds[8])" - ] - }, - { - "cell_type": "code", - "execution_count": 327, - "metadata": {}, - "outputs": [], - "source": [ - "actions = []\n", - "c = 0\n", - "for events in ds[\"events\"]:\n", - " c += 1\n", - " actions.extend([event[\"action\"] for event in events])\n", - " if c > 10000:\n", - " break\n" - ] - }, - { - "cell_type": "code", - "execution_count": 328, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'closed', 'created', 'opened', 'reopened'}" - ] - }, - "execution_count": 328, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "set(actions)" - ] - }, - { - "cell_type": "code", - "execution_count": 322, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'base_repo_info': {'pull_request.base.label': '1011X:master',\n", - " 'pull_request.base.ref': 'master',\n", - " 'pull_request.base.repo.default_branch': 'master',\n", - " 'pull_request.base.repo.description': 'Representing '\n", - " 'rational numbers '\n", - " 'using the '\n", - " 'floating-bar number '\n", - " 
'type.',\n", - " 'pull_request.base.repo.forks_count': 2,\n", - " 'pull_request.base.repo.homepage': None,\n", - " 'pull_request.base.repo.language': 'Rust',\n", - " 'pull_request.base.repo.license.name': 'Other',\n", - " 'pull_request.base.repo.name': 'floating_bar',\n", - " 'pull_request.base.repo.open_issues_count': 6,\n", - " 'pull_request.base.repo.owner.login': '1011X',\n", - " 'pull_request.base.repo.owner.type': 'User',\n", - " 'pull_request.base.repo.private': False,\n", - " 'pull_request.base.repo.stargazers_count': 15,\n", - " 'pull_request.base.repo.watchers_count': 15,\n", - " 'pull_request.base.sha': '27ee250ef208e11aa36dc77022b0f8a58e965dba',\n", - " 'pull_request.base.user.login': '1011X',\n", - " 'pull_request.base.user.type': 'User',\n", - " 'pull_request.comments': 0,\n", - " 'pull_request.label.name': None,\n", - " 'pull_request.review_comments': 0},\n", - " 'bucket': '940',\n", - " 'events': [{'action': 'opened',\n", - " 'actor.id': None,\n", - " 'actor.login': None,\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': datetime.datetime(2021, 5, 8, 20, 30, 31, tzinfo=),\n", - " 'issue.author': 'ZoeyR',\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': None,\n", - " 'pull_request.merged': None,\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.state': None,\n", - " 
'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'issue',\n", - " 'user.login': None,\n", - " 'user.type': None},\n", - " {'action': 'opened',\n", - " 'actor.id': 8010244,\n", - " 'actor.login': 'ZoeyR',\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': datetime.datetime(2021, 5, 8, 20, 30, 32, tzinfo=),\n", - " 'issue.author': None,\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': None,\n", - " 'pull_request.merged': False,\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.state': 'open',\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'PullRequestEvent',\n", - " 'user.login': None,\n", - " 'user.type': None},\n", - " {'action': 'created',\n", - " 'actor.id': None,\n", - " 'actor.login': None,\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': 
None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': datetime.datetime(2021, 5, 8, 20, 38, 27, tzinfo=),\n", - " 'issue.author': '1011X',\n", - " 'issue.comment': 'LGTM, thank you!',\n", - " 'issue.comment_id': 835503633.0,\n", - " 'pull_request.merged': None,\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.state': None,\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'comment',\n", - " 'user.login': None,\n", - " 'user.type': None},\n", - " {'action': 'closed',\n", - " 'actor.id': 1851619,\n", - " 'actor.login': '1011X',\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': datetime.datetime(2021, 5, 8, 20, 38, 38, tzinfo=),\n", - " 'issue.author': None,\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': None,\n", - " 'pull_request.merged': True,\n", - " 
'pull_request.merged_by.login': '1011X',\n", - " 'pull_request.merged_by.type': 'User',\n", - " 'pull_request.state': 'closed',\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'PullRequestEvent',\n", - " 'user.login': None,\n", - " 'user.type': None}],\n", - " 'head_repo_info': {'pull_request.head.label': 'ZoeyR:fractional-benches',\n", - " 'pull_request.head.ref': 'fractional-benches',\n", - " 'pull_request.head.repo.default_branch': 'master',\n", - " 'pull_request.head.repo.description': 'Representing '\n", - " 'rational numbers '\n", - " 'using the '\n", - " 'floating-bar number '\n", - " 'type.',\n", - " 'pull_request.head.repo.homepage': None,\n", - " 'pull_request.head.repo.language': None,\n", - " 'pull_request.head.repo.license.name': 'Other',\n", - " 'pull_request.head.repo.name': 'floating_bar',\n", - " 'pull_request.head.repo.owner.login': 'ZoeyR',\n", - " 'pull_request.head.repo.owner.type': 'User',\n", - " 'pull_request.head.repo.private': False,\n", - " 'pull_request.head.repo.stargazers_count': 0,\n", - " 'pull_request.head.sha': '742df616b7ea2cb927d5247ec69b91e6c6d8cbdd',\n", - " 'pull_request.head.user.login': 'ZoeyR',\n", - " 'pull_request.head.user.type': 'User'},\n", - " 'pull_request_info': {'org.id': None,\n", - " 'public': True,\n", - " 'pull_request.additions': 23,\n", - " 'pull_request.base.user.type': 'User',\n", - " 'pull_request.body': '',\n", - " 'pull_request.changed_files': 4,\n", - " 'pull_request.closed_at': None,\n", - " 'pull_request.comments': 0,\n", - " 'pull_request.commits': 1,\n", - " 'pull_request.created_at': '2021-05-08T20:30:31Z',\n", - " 'pull_request.deletions': 19,\n", - " 'pull_request.guid': '1011X/floating_bar/pull/7',\n", - " 'pull_request.head.user.type': 'User',\n", - " 'pull_request.id': 634875503,\n", - " 'pull_request.merged_at': None,\n", - " 
'pull_request.merged_by.login': None,\n", - " 'pull_request.milestone.description': None,\n", - " 'pull_request.milestone.number': None,\n", - " 'pull_request.milestone.title': None,\n", - " 'pull_request.number': 7,\n", - " 'pull_request.review_comments': 0,\n", - " 'pull_request.state': 'open',\n", - " 'pull_request.title': 'change benches to use fractional '\n", - " 'values',\n", - " 'pull_request.user.id': 8010244,\n", - " 'pull_request.user.login': 'ZoeyR',\n", - " 'repo.id': 166723951,\n", - " 'repo.name': '1011X/floating_bar'}}\n" - ] - } - ], - "source": [ - "pprint(ds[6])" - ] - }, - { - "cell_type": "code", - "execution_count": 318, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'bucket': '940',\n", - " 'pull_request_info': {'org.id': None,\n", - " 'public': True,\n", - " 'pull_request.additions': 23,\n", - " 'pull_request.base.user.type': 'User',\n", - " 'pull_request.body': '',\n", - " 'pull_request.changed_files': 4,\n", - " 'pull_request.closed_at': None,\n", - " 'pull_request.comments': 0,\n", - " 'pull_request.commits': 1,\n", - " 'pull_request.created_at': '2021-05-08T20:30:31Z',\n", - " 'pull_request.deletions': 19,\n", - " 'pull_request.guid': '1011X/floating_bar/pull/7',\n", - " 'pull_request.head.user.type': 'User',\n", - " 'pull_request.id': 634875503,\n", - " 'pull_request.merged_at': None,\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.milestone.description': None,\n", - " 'pull_request.milestone.number': None,\n", - " 'pull_request.milestone.title': None,\n", - " 'pull_request.number': 7,\n", - " 'pull_request.review_comments': 0,\n", - " 'pull_request.state': 'open',\n", - " 'pull_request.title': 'change benches to use fractional values',\n", - " 'pull_request.user.id': 8010244,\n", - " 'pull_request.user.login': 'ZoeyR',\n", - " 'repo.id': 166723951,\n", - " 'repo.name': '1011X/floating_bar'},\n", - " 'head_repo_info': {'pull_request.head.label': 'ZoeyR:fractional-benches',\n", - " 
'pull_request.head.ref': 'fractional-benches',\n", - " 'pull_request.head.repo.default_branch': 'master',\n", - " 'pull_request.head.repo.description': 'Representing rational numbers using the floating-bar number type.',\n", - " 'pull_request.head.repo.homepage': None,\n", - " 'pull_request.head.repo.language': None,\n", - " 'pull_request.head.repo.license.name': 'Other',\n", - " 'pull_request.head.repo.name': 'floating_bar',\n", - " 'pull_request.head.repo.owner.login': 'ZoeyR',\n", - " 'pull_request.head.repo.owner.type': 'User',\n", - " 'pull_request.head.repo.private': False,\n", - " 'pull_request.head.repo.stargazers_count': 0,\n", - " 'pull_request.head.sha': '742df616b7ea2cb927d5247ec69b91e6c6d8cbdd',\n", - " 'pull_request.head.user.login': 'ZoeyR',\n", - " 'pull_request.head.user.type': 'User'},\n", - " 'base_repo_info': {'pull_request.base.label': '1011X:master',\n", - " 'pull_request.base.ref': 'master',\n", - " 'pull_request.base.repo.default_branch': 'master',\n", - " 'pull_request.base.repo.description': 'Representing rational numbers using the floating-bar number type.',\n", - " 'pull_request.base.repo.forks_count': 2,\n", - " 'pull_request.base.repo.homepage': None,\n", - " 'pull_request.base.repo.language': 'Rust',\n", - " 'pull_request.base.repo.license.name': 'Other',\n", - " 'pull_request.base.repo.name': 'floating_bar',\n", - " 'pull_request.base.repo.open_issues_count': 6,\n", - " 'pull_request.base.repo.owner.login': '1011X',\n", - " 'pull_request.base.repo.owner.type': 'User',\n", - " 'pull_request.base.repo.private': False,\n", - " 'pull_request.base.repo.stargazers_count': 15,\n", - " 'pull_request.base.repo.watchers_count': 15,\n", - " 'pull_request.base.sha': '27ee250ef208e11aa36dc77022b0f8a58e965dba',\n", - " 'pull_request.base.user.login': '1011X',\n", - " 'pull_request.base.user.type': 'User',\n", - " 'pull_request.comments': 0,\n", - " 'pull_request.label.name': None,\n", - " 'pull_request.review_comments': 0},\n", - " 'events': 
[{'action': 'opened',\n", - " 'actor.id': None,\n", - " 'actor.login': None,\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': datetime.datetime(2021, 5, 8, 20, 30, 31, tzinfo=),\n", - " 'issue.author': 'ZoeyR',\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': None,\n", - " 'pull_request.merged': None,\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.state': None,\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'issue',\n", - " 'user.login': None,\n", - " 'user.type': None},\n", - " {'action': 'opened',\n", - " 'actor.id': 8010244,\n", - " 'actor.login': 'ZoeyR',\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 
'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': datetime.datetime(2021, 5, 8, 20, 30, 32, tzinfo=),\n", - " 'issue.author': None,\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': None,\n", - " 'pull_request.merged': False,\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.state': 'open',\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'PullRequestEvent',\n", - " 'user.login': None,\n", - " 'user.type': None},\n", - " {'action': 'created',\n", - " 'actor.id': None,\n", - " 'actor.login': None,\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': datetime.datetime(2021, 5, 8, 20, 38, 27, tzinfo=),\n", - " 'issue.author': '1011X',\n", - " 'issue.comment': 'LGTM, thank you!',\n", - " 'issue.comment_id': 835503633.0,\n", - " 'pull_request.merged': None,\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.state': None,\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 
'review.submitted_at': None,\n", - " 'type': 'comment',\n", - " 'user.login': None,\n", - " 'user.type': None},\n", - " {'action': 'closed',\n", - " 'actor.id': 1851619,\n", - " 'actor.login': '1011X',\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': datetime.datetime(2021, 5, 8, 20, 38, 38, tzinfo=),\n", - " 'issue.author': None,\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': None,\n", - " 'pull_request.merged': True,\n", - " 'pull_request.merged_by.login': '1011X',\n", - " 'pull_request.merged_by.type': 'User',\n", - " 'pull_request.state': 'closed',\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'PullRequestEvent',\n", - " 'user.login': None,\n", - " 'user.type': None}]}" - ] - }, - "execution_count": 318, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sample = ds[6]\n", - "sample" - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "metadata": {}, - "outputs": [], - "source": [ - "sample = ds[0]\n", - "pr_info = sample[\"pull_request_info\"]\n", - "head_info = sample[\"head_repo_info\"]\n", - "base_info = sample[\"base_repo_info\"]\n", - "events = sample[\"events\"]\n", - "\n", - "gh_link = 
f\"https://github.com/{pr_info['repo.name']}/pull/{pr_info['pull_request.number']}\"\n", - "\n", - "header = f\"\"\"📝 **Title**: {pr_info['pull_request.title']}
\n", - "📦 **GitHub Repo**: {pr_info['repo.name']}, PR Number: {pr_info['pull_request.number']}, ID: {pr_info['pull_request.id']}.
\n", - "Link: [{gh_link}]({gh_link})\"\"\"\n", - "pr_info_html = f\"\"\"\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
AttributeDetail
🧾 PR Type{events[0]['type']}
🟢 PR State{pr_info['pull_request.state']}
👤 PR Author{pr_info['pull_request.user.login']}
🏷️ Head Branchref: {head_info['pull_request.head.ref']}, label: {head_info['pull_request.head.label']}
🌳 Base Branch{base_info['pull_request.base.ref']}
\n", - "\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 82, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
AttributeDetail
🧾 PR TypePullRequestEvent
🟢 PR Stateopen
👤 PR Authordependabot[bot]
🏷️ Head Branchref: dependabot/npm_and_yarn/qs-6.5.3, label: AbdElrahmanMuhammedNasr:dependabot/npm_and_yarn/qs-6.5.3
🌳 Base Branchmaster
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# display pr_info_html as HTML\n", - "from IPython.display import HTML, display\n", - "display(HTML(pr_info_html))" - ] - }, - { - "cell_type": "code", - "execution_count": 75, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
actioncommentscreated_attype
0opened{'actor.id': 49699333, 'actor.login': 'dependa...2022-12-10 03:27:08+00:00PullRequestEvent
\n", - "
" - ], - "text/plain": [ - " action comments \n", - "0 opened {'actor.id': 49699333, 'actor.login': 'dependa... \\\n", - "\n", - " created_at type \n", - "0 2022-12-10 03:27:08+00:00 PullRequestEvent " - ] - }, - "execution_count": 75, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = pd.DataFrame(events)\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 84, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Event TypePullRequestEvent
UserNone (type :None)
Review StateNone
From HeadAbdElrahmanMuhammedNasr:dependabot/npm_and_yarn/qs-6.5.3
\n", - "
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "def create_grouped_events(events):\n", - " df = pd.DataFrame(events)\n", - " df['created_at'] = pd.to_datetime(df['created_at'])\n", - " df = df.sort_values(['comment.diff_hunk', 'comment.commit_id', 'created_at'])\n", - " # Group events in a the same thread using 'comment.diff_hunk' and 'comment.commit_id'\n", - " if len(df) == 1:\n", - " grouped_events = [[df.iloc[0].to_dict()]]\n", - " else:\n", - " grouped_events = [group.to_dict(orient='records') for _, group in df.groupby(['comment.diff_hunk', 'comment.commit_id'])]\n", - " return grouped_events\n", - " \n", - "events = sample[\"events\"]\n", - "grouped_events = create_grouped_events(events)\n", - "original_poster = sample[\"pull_request_info\"]['pull_request.user.login']\n", - "for thread in grouped_events:\n", - " # Start a new thread\n", - " thread_html = '
'\n", - " # Get the first event in the thread as a reference\n", - " first_event = thread[0]\n", - " \n", - " # Add shared parts of the events only once\n", - " text = f\"\"\"\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Event Type{first_event['type']}
User{first_event['user.login']} (type :{first_event['user.type']})
Review State{first_event['review.state']}
From Head{head_info['pull_request.head.label']}
\n", - "
\n", - " \"\"\"\n", - " thread_html += text\n", - " \n", - " # Add the bodies of the comments for each event in the thread\n", - " for event in thread:\n", - " if event['comment.body']:\n", - " is_op = original_poster == event['user.login']\n", - " thread_html += format_body(event['comment.body'], event['user.login'], is_op)\n", - " thread_html += '
'\n", - " display(HTML(thread_html))\n", - " if first_event['comment.path']:\n", - " path_html = f\"Path: {first_event['comment.path']}\"\n", - " display(HTML(path_html))\n", - " if first_event[\"comment.diff_hunk\"]:\n", - " print(first_event[\"comment.diff_hunk\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'dependabot[bot]'" - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sample[\"pull_request_info\"]['pull_request.user.login']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import pandas as pd\n", - "import ghdiff\n", - "import streamlit as st\n", - "import streamlit.components.v1 as components\n", - "from datasets import load_dataset\n", - "\n", - "\n", - "# save dataset as in \"bigcode/code_reviews_sample\"\n", - "ds = load_dataset(\"loubnabnl/clean_prs2\", split=\"train\")\n", - "size = len(ds)\n", - "\n", - "def show_diff_hunk(diff_hunk, position, context=5):\n", - " # exclude the first line with the @@ notation\n", - " lines = diff_hunk.split('\\n')\n", - " start_line = max(int(position) - context - 1, 0)\n", - " end_line = int(position)\n", - " actual_diff = lines[0] + '\\n' + '\\n'.join(lines[start_line + 1:end_line + 1])\n", - " focus = ghdiff.colorize(actual_diff)\n", - " full = ghdiff.colorize(diff_hunk)\n", - " # Wrap the diff hunk inside a scrollable div\n", - " scrollable_focus = f'
{focus}
'\n", - " scrollable_full = f'
{full}
'\n", - " if len(lines) <= 12:\n", - " return None, scrollable_full\n", - " return scrollable_focus, scrollable_full\n", - "\n", - "\n", - "def format_body(text, user, is_op=False):\n", - " color = \"#007bff\" if is_op else \"black\"\n", - " pr_body = f\"
👤{user}: {text}
\"\n", - " return pr_body\n", - "\n", - "\n", - "def create_grouped_events(events):\n", - " df = pd.DataFrame(events)\n", - " df['created_at'] = pd.to_datetime(df['created_at'])\n", - " df = df.sort_values(['comment.diff_hunk', 'comment.commit_id', 'created_at'])\n", - " # Group events in a the same thread using 'comment.diff_hunk' and 'comment.commit_id'\n", - " if len(df) == 1:\n", - " grouped_events = [[df.iloc[0].to_dict()]]\n", - " else:\n", - " grouped_events = [group.to_dict(orient='records') for _, group in df.groupby(['comment.diff_hunk', 'comment.commit_id'])]\n", - " return grouped_events\n", - "\n", - "\n", - "def get_pr_info(sample):\n", - " pr_info = sample[\"pull_request_info\"]\n", - " head_info = sample[\"head_repo_info\"]\n", - " base_info = sample[\"base_repo_info\"]\n", - " events = sample[\"events\"]\n", - "\n", - " gh_link = f\"https://github.com/{pr_info['repo.name']}/pull/{pr_info['pull_request.number']}\"\n", - " \n", - " header = f\"\"\"📝 **Title**: {pr_info['pull_request.title']}
\n", - " 📦 **GitHub Repo**: {pr_info['repo.name']}, PR Number: {pr_info['pull_request.number']}, ID: {pr_info['pull_request.id']}.
\n", - " Link: [{gh_link}]({gh_link})\"\"\"\n", - " pr_info_html = f\"\"\"\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
AttributeDetail
🧾 PR Type{events[0]['type']}
🟢 PR State{pr_info['pull_request.state']}
👤 PR Author{pr_info['pull_request.user.login']}
🏷️ Head Branchref: {head_info['pull_request.head.ref']}, label: {head_info['pull_request.head.label']}
🌳 Base Branch{base_info['pull_request.base.ref']}
\n", - " \"\"\"\n", - " return header, pr_info_html\n", - "\n", - "\n", - "def display_events(sample):\n", - " events = sample[\"events\"]\n", - " grouped_events = create_grouped_events(events)\n", - " original_poster = sample[\"pull_request_info\"]['pull_request.user.login']\n", - " for thread in grouped_events:\n", - " # Start a new thread\n", - " thread_html = '
'\n", - " # Get the first event in the thread as a reference\n", - " first_event = thread[0]\n", - " \n", - " # Add shared parts of the events only once\n", - " text = f\"\"\"\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Event Type{first_event['type']}
User{first_event['user.login']} (type :{first_event['user.type']})
Review State{first_event['review.state']}
From Head{first_event['pull_request.head.label']}
\n", - "
\n", - " \"\"\"\n", - " thread_html += text\n", - " \n", - " # Add the bodies of the comments for each event in the thread\n", - " for event in thread:\n", - " if event['comment.body']:\n", - " is_op = original_poster == event['user.login']\n", - " thread_html += format_body(event['comment.body'], event['user.login'], is_op)\n", - " thread_html += '
'\n", - " st.markdown(thread_html, unsafe_allow_html=True)\n", - " if first_event['comment.path']:\n", - " path_html = f\"Path: {first_event['comment.path']}\"\n", - " st.markdown(path_html, unsafe_allow_html=True)\n", - " if first_event[\"comment.diff_hunk\"]:\n", - " focus_diff, full_diff = show_diff_hunk(first_event[\"comment.diff_hunk\"], first_event[\"comment.original_position\"])\n", - " if not focus_diff:\n", - " components.html(full_diff)\n", - " else:\n", - " components.html(focus_diff)\n", - " with st.expander(\"View Full diff hunk\"):\n", - " components.html(full_diff)\n", - " st.markdown(\"---\")\n", - "\n", - "def custom_css():\n", - " st.markdown(\"\"\"\n", - " \n", - " \"\"\", unsafe_allow_html=True)\n", - "\n", - "custom_css()\n", - "\n", - "\n", - "#st.set_page_config(page_icon=\":laptop:\", layout=\"wide\")\n", - "st.markdown(f\"\"\"\\\n", - " # GitHub Code Reviews Inspection 🔍\n", - " In this space you can inspect code reviews from GitHUb Pull Requests. Note that some may have empty text (e.g approval of a PR without a code comment).\n", - " You can find the dataset at [bigcode/code_reviews_sample](https://huggingface.co/datasets/bigcode/code_reviews_sample)\n", - " \"\"\"\n", - " )\n", - "example_index = st.number_input(f\"Example (0 to {size-1}):\", min_value=0, max_value=size-1, value=0, step=1)\n", - "\n", - "header, pr_info_html = get_pr_info(ds[example_index])\n", - "st.subheader(\"PR information\")\n", - "st.markdown(header, unsafe_allow_html=True)\n", - "st.markdown(pr_info_html, unsafe_allow_html=True)\n", - "st.markdown(\"
\", unsafe_allow_html=True)\n", - "st.subheader(\"Code review events\")\n", - "event_blocks = display_events(ds[example_index])\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ValueError: The features can't be aligned because the key pull_request_info of features {'pull_request.guid': Value(dtype='string', id=None), 'pull_request.code_review_events': Value(dtype='string', id=None), 'pull_request.events': Value(dtype='string', id=None), 'pull_request.issue_events': Value(dtype='string', id=None), 'bucket': Value(dtype='string', id=None), '__index_level_0__': Value(dtype='int64', id=None), 'pull_request_info': {'org.id': Value(dtype='int64', id=None), 'public': Value(dtype='bool', id=None), 'pull_request.additions': Value(dtype='int64', id=None), 'pull_request.body': Value(dtype='string', id=None), 'pull_request.changed_files': Value(dtype='int64', id=None), 'pull_request.closed_at': Value(dtype='null', id=None), 'pull_request.comments': Value(dtype='int64', id=None), 'pull_request.commits': Value(dtype='int64', id=None), 'pull_request.created_at': Value(dtype='string', id=None), 'pull_request.deletions': Value(dtype='int64', id=None), 'pull_request.guid': Value(dtype='string', id=None), 'pull_request.id': Value(dtype='int64', id=None), 'pull_request.merged_at': Value(dtype='null', id=None), 'pull_request.merged_by.login': Value(dtype='null', id=None), 'pull_request.milestone.description': Value(dtype='null', id=None), 'pull_request.milestone.number': Value(dtype='null', id=None), 'pull_request.milestone.title': Value(dtype='null', id=None), 'pull_request.number': Value(dtype='int64', id=None), 'pull_request.review_comments': Value(dtype='int64', id=None), 'pull_request.state': Value(dtype='string', id=None), 'pull_request.title': Value(dtype='string', id=None), 'pull_request.user.id': Value(dtype='int64', id=None), 'pull_request.user.login': Value(dtype='string', id=None), 'repo.id': 
Value(dtype='int64', id=None), 'repo.name': Value(dtype='string', id=None)}, 'head_repo_info': {'pull_request.head.label': Value(dtype='string', id=None), 'pull_request.head.ref': Value(dtype='string', id=None), 'pull_request.head.repo.default_branch': Value(dtype='string', id=None), 'pull_request.head.repo.description': Value(dtype='null', id=None), 'pull_request.head.repo.homepage': Value(dtype='null', id=None), 'pull_request.head.repo.language': Value(dtype='string', id=None), 'pull_request.head.repo.license.name': Value(dtype='null', id=None), 'pull_request.head.repo.name': Value(dtype='string', id=None), 'pull_request.head.repo.owner.login': Value(dtype='string', id=None), 'pull_request.head.repo.owner.type': Value(dtype='string', id=None), 'pull_request.head.repo.private': Value(dtype='bool', id=None), 'pull_request.head.repo.stargazers_count': Value(dtype='int64', id=None), 'pull_request.head.sha': Value(dtype='string', id=None), 'pull_request.head.user.login': Value(dtype='string', id=None), 'pull_request.head.user.type': Value(dtype='string', id=None)}, 'base_repo_info': {'pull_request.base.label': Value(dtype='string', id=None), 'pull_request.base.ref': Value(dtype='string', id=None), 'pull_request.base.repo.default_branch': Value(dtype='string', id=None), 'pull_request.base.repo.description': Value(dtype='null', id=None), 'pull_request.base.repo.forks_count': Value(dtype='int64', id=None), 'pull_request.base.repo.homepage': Value(dtype='null', id=None), 'pull_request.base.repo.language': Value(dtype='string', id=None), 'pull_request.base.repo.license.name': Value(dtype='null', id=None), 'pull_request.base.repo.name': Value(dtype='string', id=None), 'pull_request.base.repo.open_issues_count': Value(dtype='int64', id=None), 'pull_request.base.repo.owner.login': Value(dtype='string', id=None), 'pull_request.base.repo.owner.type': Value(dtype='string', id=None), 'pull_request.base.repo.private': Value(dtype='bool', id=None), 
'pull_request.base.repo.stargazers_count': Value(dtype='int64', id=None), 'pull_request.base.repo.watchers_count': Value(dtype='int64', id=None), 'pull_request.base.sha': Value(dtype='string', id=None), 'pull_request.base.user.login': Value(dtype='string', id=None), 'pull_request.base.user.type': Value(dtype='string', id=None), 'pull_request.comments': Value(dtype='int64', id=None), 'pull_request.label.name': Value(dtype='null', id=None), 'pull_request.review_comments': Value(dtype='int64', id=None)}, 'events': [{'action': Value(dtype='string', id=None), 'created_at': Value(dtype='timestamp[us, tz=UTC]', id=None), 'issues_comments': {'action': Value(dtype='string', id=None), 'author': Value(dtype='null', id=None), 'comment': Value(dtype='null', id=None), 'comment_id': Value(dtype='null', id=None), 'datetime': Value(dtype='null', id=None), 'type': Value(dtype='string', id=None)}, 'review_comments': {'actor.id': Value(dtype='int64', id=None), 'actor.login': Value(dtype='string', id=None), 'comment.author_association': Value(dtype='null', id=None), 'comment.body': Value(dtype='null', id=None), 'comment.commit_id': Value(dtype='null', id=None), 'comment.created_at': Value(dtype='null', id=None), 'comment.diff_hunk': Value(dtype='null', id=None), 'comment.id': Value(dtype='null', id=None), 'comment.in_reply_to_id': Value(dtype='null', id=None), 'comment.line': Value(dtype='null', id=None), 'comment.original_commit_id': Value(dtype='null', id=None), 'comment.original_line': Value(dtype='null', id=None), 'comment.original_position': Value(dtype='null', id=None), 'comment.original_start_line': Value(dtype='null', id=None), 'comment.path': Value(dtype='null', id=None), 'comment.position': Value(dtype='null', id=None), 'comment.side': Value(dtype='null', id=None), 'comment.start_line': Value(dtype='null', id=None), 'comment.start_side': Value(dtype='null', id=None), 'comment.updated_at': Value(dtype='null', id=None), 'review.author_association': Value(dtype='null', id=None), 
'review.body': Value(dtype='null', id=None), 'review.commit_id': Value(dtype='null', id=None), 'review.id': Value(dtype='null', id=None), 'review.state': Value(dtype='null', id=None), 'review.submitted_at': Value(dtype='null', id=None), 'user.login': Value(dtype='null', id=None), 'user.type': Value(dtype='null', id=None)}, 'type': Value(dtype='string', id=None)}]} has unexpected type - {'org.id': Value(dtype='int64', id=None), 'public': Value(dtype='bool', id=None), 'pull_request.additions': Value(dtype='int64', id=None), 'pull_request.body': Value(dtype='string', id=None), 'pull_request.changed_files': Value(dtype='int64', id=None), 'pull_request.closed_at': Value(dtype='null', id=None), 'pull_request.comments': Value(dtype='int64', id=None), 'pull_request.commits': Value(dtype='int64', id=None), 'pull_request.created_at': Value(dtype='string', id=None), 'pull_request.deletions': Value(dtype='int64', id=None), 'pull_request.guid': Value(dtype='string', id=None), 'pull_request.id': Value(dtype='int64', id=None), 'pull_request.merged_at': Value(dtype='null', id=None), 'pull_request.merged_by.login': Value(dtype='null', id=None), 'pull_request.milestone.description': Value(dtype='null', id=None), 'pull_request.milestone.number': Value(dtype='null', id=None), 'pull_request.milestone.title': Value(dtype='null', id=None), 'pull_request.number': Value(dtype='int64', id=None), 'pull_request.review_comments': Value(dtype='int64', id=None), 'pull_request.state': Value(dtype='string', id=None), 'pull_request.title': Value(dtype='string', id=None), 'pull_request.user.id': Value(dtype='int64', id=None), 'pull_request.user.login': Value(dtype='string', id=None), 'repo.id': Value(dtype='int64', id=None), 'repo.name': Value(dtype='string', id=None)} (expected either {'org.id': Value(dtype='null', id=None), 'public': Value(dtype='bool', id=None), 'pull_request.additions': Value(dtype='int64', id=None), 'pull_request.body': Value(dtype='string', id=None), 
'pull_request.changed_files': Value(dtype='int64', id=None), 'pull_request.closed_at': Value(dtype='null', id=None), 'pull_request.comments': Value(dtype='int64', id=None), 'pull_request.commits': Value(dtype='int64', id=None), 'pull_request.created_at': Value(dtype='string', id=None), 'pull_request.deletions': Value(dtype='int64', id=None), 'pull_request.guid': Value(dtype='string', id=None), 'pull_request.id': Value(dtype='int64', id=None), 'pull_request.merged_at': Value(dtype='null', id=None), 'pull_request.merged_by.login': Value(dtype='null', id=None), 'pull_request.milestone.description': Value(dtype='null', id=None), 'pull_request.milestone.number': Value(dtype='null', id=None), 'pull_request.milestone.title': Value(dtype='null', id=None), 'pull_request.number': Value(dtype='int64', id=None), 'pull_request.review_comments': Value(dtype='int64', id=None), 'pull_request.state': Value(dtype='string', id=None), 'pull_request.title': Value(dtype='string', id=None), 'pull_request.user.id': Value(dtype='int64', id=None), 'pull_request.user.login': Value(dtype='string', id=None), 'repo.id': Value(dtype='int64', id=None), 'repo.name': Value(dtype='string', id=None)} or Value(\"null\").\n" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'type': 'PullRequestEvent',\n", - " 'action': 'opened',\n", - " 'actor.login': 'dependabot[bot]',\n", - " 'actor.id': 49699333,\n", - " 'user.login': None,\n", - " 'user.id': None,\n", - " 'user.type': None,\n", - " 'repo.name': 'AbdElrahmanMuhammedNasr/WuzuufMasr',\n", - " 'repo.id': 210433834,\n", - " 'public': True,\n", - " 'created_at': '2022-12-10T03:27:08Z',\n", - " 'org.id': None,\n", - " 'org.login': None,\n", - " 'pull_request.id': 1157080683,\n", - " 'pull_request.number': 35,\n", - " 'pull_request.state': 'open',\n", - " 'pull_request.title': 'Bump qs from 6.5.2 to 6.5.3',\n", - " 'pull_request.body': 'Bumps 
[qs](https://github.com/ljharb/qs) from 6.5.2 to 6.5.3.\\n
\\nChangelog\\n

Sourced from qs\\'s changelog.

\\n
\\n

6.5.3

\\n
    \\n
  • [Fix] parse: ignore __proto__ keys (#428)
  • \\n
  • [Fix] utils.merge: avoid a crash with a null target and a truthy non-array source
  • \\n
  • [Fix] correctly parse nested arrays
  • \\n
  • [Fix] stringify: fix a crash with strictNullHandling and a custom filter/serializeDate (#279)
  • \\n
  • [Fix] utils: merge: fix crash when source is a truthy primitive & no options are provided
  • \\n
  • [Fix] when parseArrays is false, properly handle keys ending in []
  • \\n
  • [Fix] fix for an impossible situation: when the formatter is called with a non-string value
  • \\n
  • [Fix] utils.merge: avoid a crash with a null target and an array source
  • \\n
  • [Refactor] utils: reduce observable [[Get]]s
  • \\n
  • [Refactor] use cached Array.isArray
  • \\n
  • [Refactor] stringify: Avoid arr = arr.concat(...), push to the existing instance (#269)
  • \\n
  • [Refactor] parse: only need to reassign the var once
  • \\n
  • [Robustness] stringify: avoid relying on a global undefined (#427)
  • \\n
  • [readme] remove travis badge; add github actions/codecov badges; update URLs
  • \\n
  • [Docs] Clean up license text so it’s properly detected as BSD-3-Clause
  • \\n
  • [Docs] Clarify the need for "arrayLimit" option
  • \\n
  • [meta] fix README.md (#399)
  • \\n
  • [meta] add FUNDING.yml
  • \\n
  • [actions] backport actions from main
  • \\n
  • [Tests] always use String(x) over x.toString()
  • \\n
  • [Tests] remove nonexistent tape option
  • \\n
  • [Dev Deps] backport from main
  • \\n
\\n
\\n
\\n
\\nCommits\\n
    \\n
  • 298bfa5 v6.5.3
  • \\n
  • ed0f5dc [Fix] parse: ignore __proto__ keys (#428)
  • \\n
  • 691e739 [Robustness] stringify: avoid relying on a global undefined (#427)
  • \\n
  • 1072d57 [readme] remove travis badge; add github actions/codecov badges; update URLs
  • \\n
  • 12ac1c4 [meta] fix README.md (#399)
  • \\n
  • 0338716 [actions] backport actions from main
  • \\n
  • 5639c20 Clean up license text so it’s properly detected as BSD-3-Clause
  • \\n
  • 51b8a0b add FUNDING.yml
  • \\n
  • 45f6759 [Fix] fix for an impossible situation: when the formatter is called with a no...
  • \\n
  • f814a7f [Dev Deps] backport from main
  • \\n
  • Additional commits viewable in compare view
  • \\n
\\n
\\n
\\n\\n\\n[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=qs&package-manager=npm_and_yarn&previous-version=6.5.2&new-version=6.5.3)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)\\n\\nDependabot will resolve any conflicts with this PR as long as you don\\'t alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.\\n\\n[//]: # (dependabot-automerge-start)\\n[//]: # (dependabot-automerge-end)\\n\\n---\\n\\n
\\nDependabot commands and options\\n
\\n\\nYou can trigger Dependabot actions by commenting on this PR:\\n- `@dependabot rebase` will rebase this PR\\n- `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it\\n- `@dependabot merge` will merge this PR after your CI passes on it\\n- `@dependabot squash and merge` will squash and merge this PR after your CI passes on it\\n- `@dependabot cancel merge` will cancel a previously requested merge and block automerging\\n- `@dependabot reopen` will reopen this PR if it is closed\\n- `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually\\n- `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)\\n- `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)\\n- `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)\\n- `@dependabot use these labels` will set the current labels as the default for future PRs for this repo and language\\n- `@dependabot use these reviewers` will set the current reviewers as the default for future PRs for this repo and language\\n- `@dependabot use these assignees` will set the current assignees as the default for future PRs for this repo and language\\n- `@dependabot use this milestone` will set the current milestone as the default for future PRs for this repo and language\\n\\nYou can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/AbdElrahmanMuhammedNasr/WuzuufMasr/network/alerts).\\n\\n
',\n", - " 'pull_request.user.login': 'dependabot[bot]',\n", - " 'pull_request.user.id': 49699333,\n", - " 'pull_request.author_association': 'NONE',\n", - " 'pull_request.created_at': '2022-12-10T03:27:08Z',\n", - " 'pull_request.updated_at': '2022-12-10T03:27:08Z',\n", - " 'pull_request.closed_at': None,\n", - " 'pull_request.merged_at': None,\n", - " 'pull_request.merge_commit_sha': None,\n", - " 'pull_request.locked': False,\n", - " 'pull_request.assignee.login': None,\n", - " 'pull_request.assignee.id': None,\n", - " 'pull_request.assignee.type': None,\n", - " 'pull_request.assignee.site_admin': None,\n", - " 'pull_request.milestone.id': None,\n", - " 'pull_request.milestone.number': None,\n", - " 'pull_request.milestone.title': None,\n", - " 'pull_request.milestone.description': None,\n", - " 'pull_request.milestone.creator.login': None,\n", - " 'pull_request.milestone.creator.id': None,\n", - " 'pull_request.milestone.creator.type': None,\n", - " 'pull_request.milestone.creator.site_admin': None,\n", - " 'pull_request.milestone.open_issues': None,\n", - " 'pull_request.milestone.closed_issues': None,\n", - " 'pull_request.milestone.state': None,\n", - " 'pull_request.milestone.created_at': None,\n", - " 'pull_request.milestone.updated_at': None,\n", - " 'pull_request.milestone.due_on': None,\n", - " 'pull_request.milestone.closed_at': None,\n", - " 'pull_request.merged': False,\n", - " 'pull_request.mergeable': None,\n", - " 'pull_request.mergeable_state': 'unknown',\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.id': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.merged_by.site_admin': None,\n", - " 'pull_request.comments': 0,\n", - " 'pull_request.review_comments': 0,\n", - " 'pull_request.commits': 1,\n", - " 'pull_request.additions': 3,\n", - " 'pull_request.deletions': 3,\n", - " 'pull_request.changed_files': 1,\n", - " 'pull_request.label.id': None,\n", - " 'pull_request.label.name': None,\n", - 
" 'pull_request.label.color': None,\n", - " 'pull_request.label.default': None,\n", - " 'pull_request.head.label': 'AbdElrahmanMuhammedNasr:dependabot/npm_and_yarn/qs-6.5.3',\n", - " 'pull_request.head.ref': 'dependabot/npm_and_yarn/qs-6.5.3',\n", - " 'pull_request.head.sha': '94469b10a02fa77e95bb22aaa0fbcc16ef03edfd',\n", - " 'pull_request.head.user.login': 'AbdElrahmanMuhammedNasr',\n", - " 'pull_request.head.user.type': 'User',\n", - " 'pull_request.head.repo.name': 'WuzuufMasr',\n", - " 'pull_request.head.repo.full_name': 'AbdElrahmanMuhammedNasr/WuzuufMasr',\n", - " 'pull_request.head.repo.owner.login': 'AbdElrahmanMuhammedNasr',\n", - " 'pull_request.head.repo.owner.type': 'User',\n", - " 'pull_request.head.repo.private': False,\n", - " 'pull_request.head.repo.homepage': None,\n", - " 'pull_request.head.repo.description': None,\n", - " 'pull_request.head.repo.fork': False,\n", - " 'pull_request.head.repo.created_at': '2019-09-23T19:17:51Z',\n", - " 'pull_request.head.repo.updated_at': '2019-10-11T19:57:45Z',\n", - " 'pull_request.head.repo.pushed_at': '2022-12-10T03:27:07Z',\n", - " 'pull_request.head.repo.size': 1345,\n", - " 'pull_request.head.repo.stargazers_count': 0,\n", - " 'pull_request.head.repo.watchers_count': 0,\n", - " 'pull_request.head.repo.language': 'TypeScript',\n", - " 'pull_request.head.repo.has_issues': True,\n", - " 'pull_request.head.repo.has_projects': True,\n", - " 'pull_request.head.repo.has_downloads': True,\n", - " 'pull_request.head.repo.has_wiki': True,\n", - " 'pull_request.head.repo.has_pages': False,\n", - " 'pull_request.head.repo.forks_count': 0,\n", - " 'pull_request.head.repo.archived': False,\n", - " 'pull_request.head.repo.disabled': False,\n", - " 'pull_request.head.repo.open_issues_count': 24,\n", - " 'pull_request.head.repo.forks': 0,\n", - " 'pull_request.head.repo.open_issues': 24,\n", - " 'pull_request.head.repo.watchers': 0,\n", - " 'pull_request.head.repo.default_branch': 'master',\n", - " 
'pull_request.head.repo.license.key': None,\n", - " 'pull_request.head.repo.license.spdx_id': None,\n", - " 'pull_request.head.repo.license.name': None,\n", - " 'pull_request.base.label': 'AbdElrahmanMuhammedNasr:master',\n", - " 'pull_request.base.ref': 'master',\n", - " 'pull_request.base.sha': 'a7d0127c02152dca69c41f83afb1a0a4d0c0e004',\n", - " 'pull_request.base.user.login': 'AbdElrahmanMuhammedNasr',\n", - " 'pull_request.base.user.type': 'User',\n", - " 'pull_request.base.repo.name': 'WuzuufMasr',\n", - " 'pull_request.base.repo.full_name': 'AbdElrahmanMuhammedNasr/WuzuufMasr',\n", - " 'pull_request.base.repo.owner.login': 'AbdElrahmanMuhammedNasr',\n", - " 'pull_request.base.repo.owner.type': 'User',\n", - " 'pull_request.base.repo.private': False,\n", - " 'pull_request.base.repo.homepage': None,\n", - " 'pull_request.base.repo.description': None,\n", - " 'pull_request.base.repo.fork': False,\n", - " 'pull_request.base.repo.created_at': '2019-09-23T19:17:51Z',\n", - " 'pull_request.base.repo.updated_at': '2019-10-11T19:57:45Z',\n", - " 'pull_request.base.repo.pushed_at': '2022-12-10T03:27:07Z',\n", - " 'pull_request.base.repo.size': 1345,\n", - " 'pull_request.base.repo.stargazers_count': 0,\n", - " 'pull_request.base.repo.watchers_count': 0,\n", - " 'pull_request.base.repo.language': 'TypeScript',\n", - " 'pull_request.base.repo.has_issues': True,\n", - " 'pull_request.base.repo.has_projects': True,\n", - " 'pull_request.base.repo.has_downloads': True,\n", - " 'pull_request.base.repo.has_wiki': True,\n", - " 'pull_request.base.repo.has_pages': False,\n", - " 'pull_request.base.repo.forks_count': 0,\n", - " 'pull_request.base.repo.archived': False,\n", - " 'pull_request.base.repo.disabled': False,\n", - " 'pull_request.base.repo.open_issues_count': 24,\n", - " 'pull_request.base.repo.forks': 0,\n", - " 'pull_request.base.repo.open_issues': 24,\n", - " 'pull_request.base.repo.watchers': 0,\n", - " 'pull_request.base.repo.default_branch': 'master',\n", - " 
'pull_request.base.repo.license.key': None,\n", - " 'pull_request.base.repo.license.spdx_id': None,\n", - " 'pull_request.base.repo.license.name': None,\n", - " 'pull_request.guid': 'AbdElrahmanMuhammedNasr/WuzuufMasr/pull/35'}]" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import json\n", - "res = json.loads(small_ds[0]['pull_request.events'])\n", - "res" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'action': 'opened',\n", - " 'author': 'hillc-usgs',\n", - " 'comment': None,\n", - " 'comment_id': None,\n", - " 'datetime': '2021-06-24T17:23:03Z',\n", - " 'description': 'This PR makes nldi_flowtools able to work with the new pygeoapi restructure, and makes it installable directly into the new tool. The processors are now contained within the library for nldi_flowtools directly, which makes it far simpler to roll out the plugin without needing coding modifications to the USGS pygeoapi tool.',\n", - " 'title': 'pygeoapi_plugins refit',\n", - " 'type': 'issue'},\n", - " {'action': 'created',\n", - " 'author': 'rmcd-mscb',\n", - " 'comment': \"@Anders-Hopkins - I merged Cliff's changes to keep things moving but you might want to review the changes for yourself when you get back. 
\",\n", - " 'comment_id': 868826717.0,\n", - " 'datetime': '2021-06-25 20:51:35+00:00',\n", - " 'description': None,\n", - " 'title': None,\n", - " 'type': 'comment'}]" - ] - }, - "execution_count": 65, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "issues = issues[0][\"events\"]\n", - "issues" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "10\n" - ] - } - ], - "source": [ - "for i in range(3, 20):\n", - " row = small_ds[i]\n", - " events = load_json(row[\"pull_request.events\"])\n", - " reviews = load_json(row[\"pull_request.code_review_events\"])\n", - " issues = load_json(row[\"pull_request.issue_events\"])\n", - " if reviews:\n", - " print(i)\n", - " break" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "len events 2, len reviews 1, len issues 1\n" - ] - } - ], - "source": [ - "row = small_ds[10]\n", - "events = load_json(row[\"pull_request.events\"])\n", - "reviews = load_json(row[\"pull_request.code_review_events\"])\n", - "issues = load_json(row[\"pull_request.issue_events\"])\n", - "print(f\"len events {len(events)}, len reviews {len(reviews)}, len issues {len(issues)}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [], - "source": [ - "events = load_json(row[\"pull_request.events\"])\n", - "reviews = load_json(row[\"pull_request.code_review_events\"])\n", - "issues = load_json(row[\"pull_request.issue_events\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [], - "source": [ - "L = events + reviews + issues" - ] - }, - { - "cell_type": "code", - "execution_count": 130, - "metadata": {}, - "outputs": [], - "source": [ - "events = load_json(row[\"pull_request.events\"])\n", - "reviews = 
load_json(row[\"pull_request.code_review_events\"])\n", - "issues = load_json(row[\"pull_request.issue_events\"])\n", - "assert len(issues) == 1\n", - "issues_events = issues[0][\"events\"]\n", - "# for each events in each category group all events sorted by \"created_at\" in one list\n", - "for e in issues_events:\n", - " e[\"created_at\"] = parse(e[\"datetime\"])\n", - " del e[\"datetime\"]\n", - "events = [update_datetime(e) for e in events]\n", - "reviews = [update_datetime(e) for e in reviews]\n", - "all_events = sorted(\n", - " events + reviews + issues_events,\n", - " key=lambda x: x[\"created_at\"]\n", - ")\n", - "\n", - "pr_info = {k: events[0][k] for k in pull_request_info_cols}\n", - "head_info = {k: events[0][k] for k in head_info_cols}\n", - "base_info = {k: events[0][k] for k in base_info_cols}\n", - "# each comment should have \"comments\" and \"review_comments\" fields with \"extra_review_info\" field\n", - "comments = [{\"type\": e[\"type\"],\n", - " \"action\": e[\"action\"],\n", - " \"created_at\": e[\"created_at\"],\n", - " \"review_comments\": get_review_info(e),\n", - " \"issues_comments\": get_issue_info(e)} for e in all_events]\n", - "new_row = {\"pull_request_info\": pr_info, \"head_repo_info\": head_info, \"base_repo_info\": base_info, \"events\": comments}" - ] - }, - { - "cell_type": "code", - "execution_count": 131, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['pull_request_info', 'head_repo_info', 'base_repo_info', 'events'])" - ] - }, - "execution_count": 131, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "new_row.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 146, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "**GitHub Repo**: ACWI-SSWD/nldi_flowtools, PR Number: 4, ID: 677298606\n", - "**GitHub Link**: https://github.com/ACWI-SSWD/nldi_flowtools/pull/4\n", - 
"----------------------------------------------------------------------------------------------------\n", - "Type: issue, action: opened, created_at: 2021-06-24 17:23:03+00:00\n", - "Author hillc-usgs did opened:\n", - "None\n", - "----------------------------------------------------------------------------------------------------\n", - "Type: PullRequestEvent, action: opened, created_at: 2021-06-24 17:23:04+00:00\n", - "Author hillc-usgs with association None did opened\n", - "----------------------------------------------------------------------------------------------------\n", - "Type: PullRequestReviewEvent, action: created, created_at: 2021-06-25 20:50:41+00:00\n", - "Author rmcd-mscb with association NONE did created\n", - "Review:\n", - "Thanks Cliff - Anders has been out this week, to keep things moving I'll merge the request and leave the branch for him to view when he gets back. \n", - "----------------------------------------------------------------------------------------------------\n", - "Type: PullRequestEvent, action: closed, created_at: 2021-06-25 20:50:54+00:00\n", - "Author rmcd-mscb with association None did closed\n", - "----------------------------------------------------------------------------------------------------\n", - "Type: comment, action: created, created_at: 2021-06-25 20:51:35+00:00\n", - "Author rmcd-mscb did created:\n", - "@Anders-Hopkins - I merged Cliff's changes to keep things moving but you might want to review the changes for yourself when you get back. 
\n" - ] - } - ], - "source": [ - "pr_info = new_row[\"pull_request_info\"]\n", - "res = f\"**GitHub Repo**: {pr_info['repo.name']}, PR Number: {pr_info['pull_request.number']}, ID: {pr_info['pull_request.id']}\"\n", - "gh_link = f\"https://github.com/{pr_info['repo.name']}/pull/{pr_info['pull_request.number']}\"\n", - "res += f\"\\n**GitHub Link**: {gh_link}\"\n", - "print(res)\n", - "for i in range(len(new_row[\"events\"])):\n", - " e = new_row[\"events\"][i]\n", - " print(\"-\" * 100)\n", - " print(f\"Type: {e['type']}, action: {e['action']}, created_at: {e['created_at']}\")\n", - " action = e['action']\n", - "\n", - " if e['type'] in [\"issue\", \"comment\"]:\n", - " e = e[\"issues_comments\"]\n", - " print(f\"Author {e['author']} did {e['action']}:\\n{e['comment']}\")\n", - "\n", - " elif e['type'] in [\"PullRequestEvent\", \"PullRequestReviewCommentEvent\", \"PullRequestReviewEvent\"]:\n", - " reviews = e[\"review_comments\"]\n", - " print(f\"Author {reviews['actor.login']} with association {reviews['review.author_association']} did {action}\")\n", - " if reviews['review.body']:\n", - " print(f\"Review:\\n{reviews['review.body']}\")\n", - " if reviews['comment.body']:\n", - " print(f\"Comment:\\n{reviews['comment.body']}\")\n", - " if reviews['comment.diff_hunk']:\n", - " print(f\"Diff hunk:\\n{reviews['diff_hunk']}\")\n", - " print(f\"File path {reviews['path']}\")\n", - " else:\n", - " print(\"OTHER\")\n", - " print(e[\"type\"])\n" - ] - }, - { - "cell_type": "code", - "execution_count": 144, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'pull_request.base.label': 'ACWI-SSWD:master',\n", - " 'pull_request.base.ref': 'master',\n", - " 'pull_request.base.sha': '4ce49143e7ce6e473554c3ebf7335a23d91ca91c',\n", - " 'pull_request.base.user.login': 'ACWI-SSWD',\n", - " 'pull_request.base.user.type': 'Organization',\n", - " 'pull_request.base.repo.owner.login': 'ACWI-SSWD',\n", - " 'pull_request.base.repo.owner.type': 'Organization',\n", - " 
'pull_request.base.repo.license.name': 'BSD 3-Clause \"New\" or \"Revised\" License',\n", - " 'pull_request.base.repo.default_branch': 'master',\n", - " 'pull_request.base.repo.description': None,\n", - " 'pull_request.base.repo.language': 'Python',\n", - " 'pull_request.base.repo.watchers_count': 3,\n", - " 'pull_request.base.repo.open_issues_count': 1,\n", - " 'pull_request.base.repo.forks_count': 0,\n", - " 'pull_request.base.repo.name': 'nldi_flowtools',\n", - " 'pull_request.base.repo.homepage': None,\n", - " 'pull_request.base.repo.stargazers_count': 3,\n", - " 'pull_request.base.repo.private': False,\n", - " 'pull_request.comments': 0,\n", - " 'pull_request.review_comments': 0,\n", - " 'pull_request.label.name': None}" - ] - }, - "execution_count": 144, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pr_info" - ] - }, - { - "cell_type": "code", - "execution_count": 145, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'type': 'PullRequestEvent',\n", - " 'action': 'opened',\n", - " 'actor.login': 'hillc-usgs',\n", - " 'actor.id': 84474574,\n", - " 'user.login': None,\n", - " 'user.id': None,\n", - " 'user.type': None,\n", - " 'repo.name': 'ACWI-SSWD/nldi_flowtools',\n", - " 'repo.id': 365244721,\n", - " 'public': True,\n", - " 'created_at': datetime.datetime(2021, 6, 24, 17, 23, 4, tzinfo=tzlocal()),\n", - " 'org.id': 17301770,\n", - " 'org.login': 'ACWI-SSWD',\n", - " 'pull_request.id': 677298606,\n", - " 'pull_request.number': 4,\n", - " 'pull_request.state': 'open',\n", - " 'pull_request.title': 'pygeoapi_plugins refit',\n", - " 'pull_request.body': 'This PR makes nldi_flowtools able to work with the new pygeoapi restructure, and makes it installable directly into the new tool. 
The processors are now contained within the library for nldi_flowtools directly, which makes it far simpler to roll out the plugin without needing coding modifications to the USGS pygeoapi tool.',\n", - " 'pull_request.user.login': 'hillc-usgs',\n", - " 'pull_request.user.id': 84474574,\n", - " 'pull_request.author_association': 'NONE',\n", - " 'pull_request.created_at': '2021-06-24T17:23:03Z',\n", - " 'pull_request.updated_at': '2021-06-24T17:23:03Z',\n", - " 'pull_request.closed_at': None,\n", - " 'pull_request.merged_at': None,\n", - " 'pull_request.merge_commit_sha': None,\n", - " 'pull_request.locked': False,\n", - " 'pull_request.assignee.login': None,\n", - " 'pull_request.assignee.id': None,\n", - " 'pull_request.assignee.type': None,\n", - " 'pull_request.assignee.site_admin': None,\n", - " 'pull_request.milestone.id': None,\n", - " 'pull_request.milestone.number': None,\n", - " 'pull_request.milestone.title': None,\n", - " 'pull_request.milestone.description': None,\n", - " 'pull_request.milestone.creator.login': None,\n", - " 'pull_request.milestone.creator.id': None,\n", - " 'pull_request.milestone.creator.type': None,\n", - " 'pull_request.milestone.creator.site_admin': None,\n", - " 'pull_request.milestone.open_issues': None,\n", - " 'pull_request.milestone.closed_issues': None,\n", - " 'pull_request.milestone.state': None,\n", - " 'pull_request.milestone.created_at': None,\n", - " 'pull_request.milestone.updated_at': None,\n", - " 'pull_request.milestone.due_on': None,\n", - " 'pull_request.milestone.closed_at': None,\n", - " 'pull_request.merged': False,\n", - " 'pull_request.mergeable': None,\n", - " 'pull_request.mergeable_state': 'unknown',\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.id': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.merged_by.site_admin': None,\n", - " 'pull_request.comments': 0,\n", - " 'pull_request.review_comments': 0,\n", - " 'pull_request.commits': 5,\n", - " 
'pull_request.additions': 321,\n", - " 'pull_request.deletions': 25,\n", - " 'pull_request.changed_files': 5,\n", - " 'pull_request.label.id': None,\n", - " 'pull_request.label.name': None,\n", - " 'pull_request.label.color': None,\n", - " 'pull_request.label.default': None,\n", - " 'pull_request.head.label': 'ACWI-SSWD:pygeoapi_plugins-refit',\n", - " 'pull_request.head.ref': 'pygeoapi_plugins-refit',\n", - " 'pull_request.head.sha': '9143699913269aff0814979d932957efeb002eb1',\n", - " 'pull_request.head.user.login': 'ACWI-SSWD',\n", - " 'pull_request.head.user.type': 'Organization',\n", - " 'pull_request.head.repo.name': 'nldi_flowtools',\n", - " 'pull_request.head.repo.full_name': 'ACWI-SSWD/nldi_flowtools',\n", - " 'pull_request.head.repo.owner.login': 'ACWI-SSWD',\n", - " 'pull_request.head.repo.owner.type': 'Organization',\n", - " 'pull_request.head.repo.private': False,\n", - " 'pull_request.head.repo.homepage': None,\n", - " 'pull_request.head.repo.description': None,\n", - " 'pull_request.head.repo.fork': False,\n", - " 'pull_request.head.repo.created_at': '2021-05-07T13:36:47Z',\n", - " 'pull_request.head.repo.updated_at': '2021-06-23T14:27:31Z',\n", - " 'pull_request.head.repo.pushed_at': '2021-06-24T15:15:30Z',\n", - " 'pull_request.head.repo.size': 4309,\n", - " 'pull_request.head.repo.stargazers_count': 3,\n", - " 'pull_request.head.repo.watchers_count': 3,\n", - " 'pull_request.head.repo.language': 'Python',\n", - " 'pull_request.head.repo.has_issues': True,\n", - " 'pull_request.head.repo.has_projects': True,\n", - " 'pull_request.head.repo.has_downloads': True,\n", - " 'pull_request.head.repo.has_wiki': True,\n", - " 'pull_request.head.repo.has_pages': False,\n", - " 'pull_request.head.repo.forks_count': 0,\n", - " 'pull_request.head.repo.archived': False,\n", - " 'pull_request.head.repo.disabled': False,\n", - " 'pull_request.head.repo.open_issues_count': 1,\n", - " 'pull_request.head.repo.forks': 0,\n", - " 'pull_request.head.repo.open_issues': 
1,\n", - " 'pull_request.head.repo.watchers': 3,\n", - " 'pull_request.head.repo.default_branch': 'master',\n", - " 'pull_request.head.repo.license.key': 'bsd-3-clause',\n", - " 'pull_request.head.repo.license.spdx_id': 'BSD-3-Clause',\n", - " 'pull_request.head.repo.license.name': 'BSD 3-Clause \"New\" or \"Revised\" License',\n", - " 'pull_request.base.label': 'ACWI-SSWD:master',\n", - " 'pull_request.base.ref': 'master',\n", - " 'pull_request.base.sha': '4ce49143e7ce6e473554c3ebf7335a23d91ca91c',\n", - " 'pull_request.base.user.login': 'ACWI-SSWD',\n", - " 'pull_request.base.user.type': 'Organization',\n", - " 'pull_request.base.repo.name': 'nldi_flowtools',\n", - " 'pull_request.base.repo.full_name': 'ACWI-SSWD/nldi_flowtools',\n", - " 'pull_request.base.repo.owner.login': 'ACWI-SSWD',\n", - " 'pull_request.base.repo.owner.type': 'Organization',\n", - " 'pull_request.base.repo.private': False,\n", - " 'pull_request.base.repo.homepage': None,\n", - " 'pull_request.base.repo.description': None,\n", - " 'pull_request.base.repo.fork': False,\n", - " 'pull_request.base.repo.created_at': '2021-05-07T13:36:47Z',\n", - " 'pull_request.base.repo.updated_at': '2021-06-23T14:27:31Z',\n", - " 'pull_request.base.repo.pushed_at': '2021-06-24T15:15:30Z',\n", - " 'pull_request.base.repo.size': 4309,\n", - " 'pull_request.base.repo.stargazers_count': 3,\n", - " 'pull_request.base.repo.watchers_count': 3,\n", - " 'pull_request.base.repo.language': 'Python',\n", - " 'pull_request.base.repo.has_issues': True,\n", - " 'pull_request.base.repo.has_projects': True,\n", - " 'pull_request.base.repo.has_downloads': True,\n", - " 'pull_request.base.repo.has_wiki': True,\n", - " 'pull_request.base.repo.has_pages': False,\n", - " 'pull_request.base.repo.forks_count': 0,\n", - " 'pull_request.base.repo.archived': False,\n", - " 'pull_request.base.repo.disabled': False,\n", - " 'pull_request.base.repo.open_issues_count': 1,\n", - " 'pull_request.base.repo.forks': 0,\n", - " 
'pull_request.base.repo.open_issues': 1,\n", - " 'pull_request.base.repo.watchers': 3,\n", - " 'pull_request.base.repo.default_branch': 'master',\n", - " 'pull_request.base.repo.license.key': 'bsd-3-clause',\n", - " 'pull_request.base.repo.license.spdx_id': 'BSD-3-Clause',\n", - " 'pull_request.base.repo.license.name': 'BSD 3-Clause \"New\" or \"Revised\" License',\n", - " 'pull_request.guid': 'ACWI-SSWD/nldi_flowtools/pull/4'},\n", - " {'type': 'PullRequestEvent',\n", - " 'action': 'closed',\n", - " 'actor.login': 'rmcd-mscb',\n", - " 'actor.id': 11791580,\n", - " 'user.login': None,\n", - " 'user.id': None,\n", - " 'user.type': None,\n", - " 'repo.name': 'ACWI-SSWD/nldi_flowtools',\n", - " 'repo.id': 365244721,\n", - " 'public': True,\n", - " 'created_at': datetime.datetime(2021, 6, 25, 20, 50, 54, tzinfo=tzlocal()),\n", - " 'org.id': 17301770,\n", - " 'org.login': 'ACWI-SSWD',\n", - " 'pull_request.id': 677298606,\n", - " 'pull_request.number': 4,\n", - " 'pull_request.state': 'closed',\n", - " 'pull_request.title': 'pygeoapi_plugins refit',\n", - " 'pull_request.body': 'This PR makes nldi_flowtools able to work with the new pygeoapi restructure, and makes it installable directly into the new tool. 
The processors are now contained within the library for nldi_flowtools directly, which makes it far simpler to roll out the plugin without needing coding modifications to the USGS pygeoapi tool.',\n", - " 'pull_request.user.login': 'hillc-usgs',\n", - " 'pull_request.user.id': 84474574,\n", - " 'pull_request.author_association': 'NONE',\n", - " 'pull_request.created_at': '2021-06-24T17:23:03Z',\n", - " 'pull_request.updated_at': '2021-06-25T20:50:53Z',\n", - " 'pull_request.closed_at': '2021-06-25T20:50:53Z',\n", - " 'pull_request.merged_at': '2021-06-25T20:50:53Z',\n", - " 'pull_request.merge_commit_sha': 'c0a8e850c8e627b0474b9059582e7a61e5fd3699',\n", - " 'pull_request.locked': False,\n", - " 'pull_request.assignee.login': None,\n", - " 'pull_request.assignee.id': None,\n", - " 'pull_request.assignee.type': None,\n", - " 'pull_request.assignee.site_admin': None,\n", - " 'pull_request.milestone.id': None,\n", - " 'pull_request.milestone.number': None,\n", - " 'pull_request.milestone.title': None,\n", - " 'pull_request.milestone.description': None,\n", - " 'pull_request.milestone.creator.login': None,\n", - " 'pull_request.milestone.creator.id': None,\n", - " 'pull_request.milestone.creator.type': None,\n", - " 'pull_request.milestone.creator.site_admin': None,\n", - " 'pull_request.milestone.open_issues': None,\n", - " 'pull_request.milestone.closed_issues': None,\n", - " 'pull_request.milestone.state': None,\n", - " 'pull_request.milestone.created_at': None,\n", - " 'pull_request.milestone.updated_at': None,\n", - " 'pull_request.milestone.due_on': None,\n", - " 'pull_request.milestone.closed_at': None,\n", - " 'pull_request.merged': True,\n", - " 'pull_request.mergeable': None,\n", - " 'pull_request.mergeable_state': 'unknown',\n", - " 'pull_request.merged_by.login': 'rmcd-mscb',\n", - " 'pull_request.merged_by.id': 11791580,\n", - " 'pull_request.merged_by.type': 'User',\n", - " 'pull_request.merged_by.site_admin': False,\n", - " 'pull_request.comments': 0,\n", 
- " 'pull_request.review_comments': 0,\n", - " 'pull_request.commits': 7,\n", - " 'pull_request.additions': 292,\n", - " 'pull_request.deletions': 1,\n", - " 'pull_request.changed_files': 5,\n", - " 'pull_request.label.id': None,\n", - " 'pull_request.label.name': None,\n", - " 'pull_request.label.color': None,\n", - " 'pull_request.label.default': None,\n", - " 'pull_request.head.label': 'ACWI-SSWD:pygeoapi_plugins-refit',\n", - " 'pull_request.head.ref': 'pygeoapi_plugins-refit',\n", - " 'pull_request.head.sha': '3e3fe0dfdfce5fe24c25231c3207c2d292b31165',\n", - " 'pull_request.head.user.login': 'ACWI-SSWD',\n", - " 'pull_request.head.user.type': 'Organization',\n", - " 'pull_request.head.repo.name': 'nldi_flowtools',\n", - " 'pull_request.head.repo.full_name': 'ACWI-SSWD/nldi_flowtools',\n", - " 'pull_request.head.repo.owner.login': 'ACWI-SSWD',\n", - " 'pull_request.head.repo.owner.type': 'Organization',\n", - " 'pull_request.head.repo.private': False,\n", - " 'pull_request.head.repo.homepage': None,\n", - " 'pull_request.head.repo.description': None,\n", - " 'pull_request.head.repo.fork': False,\n", - " 'pull_request.head.repo.created_at': '2021-05-07T13:36:47Z',\n", - " 'pull_request.head.repo.updated_at': '2021-06-23T14:27:31Z',\n", - " 'pull_request.head.repo.pushed_at': '2021-06-25T20:50:53Z',\n", - " 'pull_request.head.repo.size': 4310,\n", - " 'pull_request.head.repo.stargazers_count': 3,\n", - " 'pull_request.head.repo.watchers_count': 3,\n", - " 'pull_request.head.repo.language': 'Python',\n", - " 'pull_request.head.repo.has_issues': True,\n", - " 'pull_request.head.repo.has_projects': True,\n", - " 'pull_request.head.repo.has_downloads': True,\n", - " 'pull_request.head.repo.has_wiki': True,\n", - " 'pull_request.head.repo.has_pages': False,\n", - " 'pull_request.head.repo.forks_count': 0,\n", - " 'pull_request.head.repo.archived': False,\n", - " 'pull_request.head.repo.disabled': False,\n", - " 'pull_request.head.repo.open_issues_count': 0,\n", - " 
'pull_request.head.repo.forks': 0,\n", - " 'pull_request.head.repo.open_issues': 0,\n", - " 'pull_request.head.repo.watchers': 3,\n", - " 'pull_request.head.repo.default_branch': 'master',\n", - " 'pull_request.head.repo.license.key': 'bsd-3-clause',\n", - " 'pull_request.head.repo.license.spdx_id': 'BSD-3-Clause',\n", - " 'pull_request.head.repo.license.name': 'BSD 3-Clause \"New\" or \"Revised\" License',\n", - " 'pull_request.base.label': 'ACWI-SSWD:master',\n", - " 'pull_request.base.ref': 'master',\n", - " 'pull_request.base.sha': '4ce49143e7ce6e473554c3ebf7335a23d91ca91c',\n", - " 'pull_request.base.user.login': 'ACWI-SSWD',\n", - " 'pull_request.base.user.type': 'Organization',\n", - " 'pull_request.base.repo.name': 'nldi_flowtools',\n", - " 'pull_request.base.repo.full_name': 'ACWI-SSWD/nldi_flowtools',\n", - " 'pull_request.base.repo.owner.login': 'ACWI-SSWD',\n", - " 'pull_request.base.repo.owner.type': 'Organization',\n", - " 'pull_request.base.repo.private': False,\n", - " 'pull_request.base.repo.homepage': None,\n", - " 'pull_request.base.repo.description': None,\n", - " 'pull_request.base.repo.fork': False,\n", - " 'pull_request.base.repo.created_at': '2021-05-07T13:36:47Z',\n", - " 'pull_request.base.repo.updated_at': '2021-06-23T14:27:31Z',\n", - " 'pull_request.base.repo.pushed_at': '2021-06-25T20:50:53Z',\n", - " 'pull_request.base.repo.size': 4310,\n", - " 'pull_request.base.repo.stargazers_count': 3,\n", - " 'pull_request.base.repo.watchers_count': 3,\n", - " 'pull_request.base.repo.language': 'Python',\n", - " 'pull_request.base.repo.has_issues': True,\n", - " 'pull_request.base.repo.has_projects': True,\n", - " 'pull_request.base.repo.has_downloads': True,\n", - " 'pull_request.base.repo.has_wiki': True,\n", - " 'pull_request.base.repo.has_pages': False,\n", - " 'pull_request.base.repo.forks_count': 0,\n", - " 'pull_request.base.repo.archived': False,\n", - " 'pull_request.base.repo.disabled': False,\n", - " 
'pull_request.base.repo.open_issues_count': 0,\n", - " 'pull_request.base.repo.forks': 0,\n", - " 'pull_request.base.repo.open_issues': 0,\n", - " 'pull_request.base.repo.watchers': 3,\n", - " 'pull_request.base.repo.default_branch': 'master',\n", - " 'pull_request.base.repo.license.key': 'bsd-3-clause',\n", - " 'pull_request.base.repo.license.spdx_id': 'BSD-3-Clause',\n", - " 'pull_request.base.repo.license.name': 'BSD 3-Clause \"New\" or \"Revised\" License',\n", - " 'pull_request.guid': 'ACWI-SSWD/nldi_flowtools/pull/4'}]" - ] - }, - "execution_count": 145, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "events" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'2021-06-24T17:23:03Z'" - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "events[0][\"created_at\"]\n", - "issues[0][\"events\"][0][\"datetime\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'action': 'created',\n", - " 'author': 'rmcd-mscb',\n", - " 'comment': \"@Anders-Hopkins - I merged Cliff's changes to keep things moving but you might want to review the changes for yourself when you get back. 
\",\n", - " 'comment_id': 868826717.0,\n", - " 'datetime': '2021-06-25 20:51:35+00:00',\n", - " 'description': None,\n", - " 'title': None,\n", - " 'type': 'comment'}" - ] - }, - "execution_count": 60, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "issues[0][\"events\"][1]\n" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['repo', 'org', 'issue_id', 'issue_number', 'pull_request', 'events'])" - ] - }, - "execution_count": 56, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "issues[0].keys()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "all_events = sorted(\n", - " events + reviews + issues,\n", - " key=lambda x: x[\"created_at\"]\n", - ")\n", - "pr_info = {k: all_events[-1][k] for k in pull_request_info_cols}\n", - "head_info = {k: all_events[-1][k] for k in head_info_cols}\n", - "base_info = {k: all_events[-1][k] for k in base_info_cols}\n", - "# each comment should have \"comments\" and \"review_comments\" fields with \"extra_review_info\" field\n", - "comments = [{\"comments\": e[\"pull_request.comments\"],\n", - " \"review_comments\": e[\"pull_request.review_comments\"],\n", - " \"extra_review_info\": get_extra_review_info(e)} for e in all_events]\n", - "new_row = {\"pr_info\": pr_info, \"head_info\": head_info, \"base_info\": base_info, \"comments\": comments}" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "ename": "KeyError", - "evalue": "'created_at'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[38], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m new_row \u001b[39m=\u001b[39m merge_events(row)\n", 
- "Cell \u001b[0;32mIn[36], line 106\u001b[0m, in \u001b[0;36mmerge_events\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 102\u001b[0m issues \u001b[39m=\u001b[39m load_json(row[\u001b[39m\"\u001b[39m\u001b[39mpull_request.issue_events\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[1;32m 103\u001b[0m \u001b[39m# for each events in each category group all events sorted by \"created_at\" in one list\u001b[39;00m\n\u001b[1;32m 104\u001b[0m \u001b[39m# then merge all three lists\u001b[39;00m\n\u001b[1;32m 105\u001b[0m \u001b[39m# then sort by \"created_at\"\u001b[39;00m\n\u001b[0;32m--> 106\u001b[0m all_events \u001b[39m=\u001b[39m \u001b[39msorted\u001b[39;49m(\n\u001b[1;32m 107\u001b[0m events \u001b[39m+\u001b[39;49m reviews \u001b[39m+\u001b[39;49m issues,\n\u001b[1;32m 108\u001b[0m key\u001b[39m=\u001b[39;49m\u001b[39mlambda\u001b[39;49;00m x: x[\u001b[39m\"\u001b[39;49m\u001b[39mcreated_at\u001b[39;49m\u001b[39m\"\u001b[39;49m]\n\u001b[1;32m 109\u001b[0m )\n\u001b[1;32m 110\u001b[0m pr_info \u001b[39m=\u001b[39m {k: all_events[\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m][k] \u001b[39mfor\u001b[39;00m k \u001b[39min\u001b[39;00m pull_request_info_cols}\n\u001b[1;32m 111\u001b[0m head_info \u001b[39m=\u001b[39m {k: all_events[\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m][k] \u001b[39mfor\u001b[39;00m k \u001b[39min\u001b[39;00m head_info_cols}\n", - "Cell \u001b[0;32mIn[36], line 108\u001b[0m, in \u001b[0;36mmerge_events..\u001b[0;34m(x)\u001b[0m\n\u001b[1;32m 102\u001b[0m issues \u001b[39m=\u001b[39m load_json(row[\u001b[39m\"\u001b[39m\u001b[39mpull_request.issue_events\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[1;32m 103\u001b[0m \u001b[39m# for each events in each category group all events sorted by \"created_at\" in one list\u001b[39;00m\n\u001b[1;32m 104\u001b[0m \u001b[39m# then merge all three lists\u001b[39;00m\n\u001b[1;32m 105\u001b[0m \u001b[39m# then sort by \"created_at\"\u001b[39;00m\n\u001b[1;32m 106\u001b[0m all_events \u001b[39m=\u001b[39m 
\u001b[39msorted\u001b[39m(\n\u001b[1;32m 107\u001b[0m events \u001b[39m+\u001b[39m reviews \u001b[39m+\u001b[39m issues,\n\u001b[0;32m--> 108\u001b[0m key\u001b[39m=\u001b[39m\u001b[39mlambda\u001b[39;00m x: x[\u001b[39m\"\u001b[39;49m\u001b[39mcreated_at\u001b[39;49m\u001b[39m\"\u001b[39;49m]\n\u001b[1;32m 109\u001b[0m )\n\u001b[1;32m 110\u001b[0m pr_info \u001b[39m=\u001b[39m {k: all_events[\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m][k] \u001b[39mfor\u001b[39;00m k \u001b[39min\u001b[39;00m pull_request_info_cols}\n\u001b[1;32m 111\u001b[0m head_info \u001b[39m=\u001b[39m {k: all_events[\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m][k] \u001b[39mfor\u001b[39;00m k \u001b[39min\u001b[39;00m head_info_cols}\n", - "\u001b[0;31mKeyError\u001b[0m: 'created_at'" - ] - } - ], - "source": [ - "new_row = merge_events(row)" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'repo': 'ACWI-SSWD/nldi_flowtools',\n", - " 'org': 'ACWI-SSWD',\n", - " 'issue_id': 929448726,\n", - " 'issue_number': 4,\n", - " 'pull_request': {'number': 4.0,\n", - " 'repo': 'nldi_flowtools',\n", - " 'user_login': 'ACWI-SSWD'},\n", - " 'events': [{'action': 'opened',\n", - " 'author': 'hillc-usgs',\n", - " 'comment': None,\n", - " 'comment_id': None,\n", - " 'datetime': '2021-06-24T17:23:03Z',\n", - " 'description': 'This PR makes nldi_flowtools able to work with the new pygeoapi restructure, and makes it installable directly into the new tool. The processors are now contained within the library for nldi_flowtools directly, which makes it far simpler to roll out the plugin without needing coding modifications to the USGS pygeoapi tool.',\n", - " 'title': 'pygeoapi_plugins refit',\n", - " 'type': 'issue'},\n", - " {'action': 'created',\n", - " 'author': 'rmcd-mscb',\n", - " 'comment': \"@Anders-Hopkins - I merged Cliff's changes to keep things moving but you might want to review the changes for yourself when you get back. 
\",\n", - " 'comment_id': 868826717.0,\n", - " 'datetime': '2021-06-25 20:51:35+00:00',\n", - " 'description': None,\n", - " 'title': None,\n", - " 'type': 'comment'}]}" - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "issues" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "# for each events in each category group all events sorted by \"created_at\" in one list\n", - "# then merge all three lists\n", - "# then sort by \"created_at\"\n", - "all_events = sorted(\n", - " events + reviews + issues,\n", - " key=lambda x: x[\"created_at\"]\n", - ")\n", - "pr_info = {k: all_events[-1][k] for k in pull_request_info_cols}\n", - "head_info = {k: all_events[-1][k] for k in head_info_cols}\n", - "base_info = {k: all_events[-1][k] for k in base_info_cols}\n", - "# each comment should have \"comments\" and \"review_comments\" fields with \"extra_review_info\" field\n", - "comments = [{\"comments\": e[\"pull_request.comments\"],\n", - " \"review_comments\": e[\"pull_request.review_comments\"],\n", - " \"extra_review_info\": get_extra_review_info(e)} for e in all_events]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pull_request_info_cols = [\n", - " \"repo.name\",\n", - " \"repo.id\",\n", - " \"org.id\",\n", - " \"public\",\n", - " \"pull_request.id\",\n", - " \"pull_request.guid\",\n", - " \"pull_request.number\",\n", - " \"pull_request.title\",\n", - " \"pull_request.body\",\n", - " \"pull_request.state\",\n", - " \"pull_request.user.login\",\n", - " \"pull_request.user.id\",\n", - " \"pull_request.created_at\",\n", - " \"pull_request.closed_at\",\n", - " \"pull_request.merged_at\",\n", - " \"pull_request.merged_by.login\",\n", - " \"pull_request.milestone.title\",\n", - " \"pull_request.milestone.description\",\n", - " \"pull_request.milestone.number\",\n", - " # 
commits\n", - " 'pull_request.commits',\n", - " 'pull_request.additions',\n", - " 'pull_request.deletions',\n", - " # changed files\n", - " 'pull_request.changed_files',\n", - "]\n", - "\n", - "comments = [\n", - " 'pull_request.comments',\n", - " 'pull_request.review_comments',\n", - " # for PR event\n", - " 'pull_request.label.name',\n", - " # review events only\n", - " 'review.state',\n", - " 'review.id', \n", - " 'review.body', \n", - " 'review.commit_id', \n", - " 'review.submitted_at', \n", - " 'review.author_association', '\n", - "]\n", - "\n", - "head_info_cols = [\n", - " \"pull_request.head.label\",\n", - " \"pull_request.head.ref\",\n", - " \"pull_request.head.user.login\",\n", - " \"pull_request.head.user.type\",\n", - " \"pull_request.head.repo.owner.login\",\n", - " \"pull_request.head.repo.owner.type\",\n", - " \"pull_request.head.repo.license.name\",\n", - " \"pull_request.head.sha\",\n", - " 'pull_request.head.repo.name',\n", - " 'pull_request.head.repo.owner.login',\n", - " 'pull_request.head.repo.homepage',\n", - " 'pull_request.head.repo.description',\n", - " 'pull_request.head.repo.language',\n", - " 'pull_request.head.repo.stargazers_count',\n", - " 'pull_request.head.repo.license.name',\n", - " 'pull_request.head.repo.default_branch',\n", - " 'pull_request.head.repo.private'\n", - "]\n", - "base_info_cols = [\n", - " \"pull_request.base.label\",\n", - " \"pull_request.base.ref\",\n", - " \"pull_request.base.sha\",\n", - " \"pull_request.base.user.login\",\n", - " \"pull_request.base.user.type\",\n", - " \"pull_request.base.repo.owner.login\",\n", - " \"pull_request.base.repo.owner.type\",\n", - " \"pull_request.base.repo.license.name\",\n", - " \"pull_request.base.repo.default_branch\",\n", - " \"pull_request.base.repo.description\",\n", - " \"pull_request.base.repo.language\",\n", - " \"pull_request.base.repo.watchers_count\",\n", - " \"pull_request.base.repo.open_issues_count\",\n", - " \"pull_request.base.repo.forks_count\",\n", - " 
'pull_request.base.repo.name',\n", - " 'pull_request.base.repo.owner.login',\n", - " 'pull_request.base.repo.homepage',\n", - " 'pull_request.base.repo.description',\n", - " 'pull_request.base.repo.language',\n", - " 'pull_request.base.repo.stargazers_count',\n", - " 'pull_request.base.repo.private',\n", - "]\n", - "# drop \"repo.name\", \"repo.id\", \"public\" so they are not duplicated and keep relevant columns that might change\n", - "event_cols = [\n", - " col\n", - " for col in df.columns\n", - " if (not col.startswith(\"pull_request.\"))\n", - " and col not in [\"repo.name\", \"repo.id\", \"public\"]\n", - "] + [\n", - " \"pull_request.head.label\",\n", - " \"pull_request.head.ref\",\n", - " \"pull_request.head.sha\",\n", - " \"pull_request.title\",\n", - "]" + "merged_ds.push_to_hub(\"loubnabnl/code_reviews_500k\")" ] } ],