From 49d38c4d503734f041cdb0036e6c5f3176b45264 Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Wed, 20 Sep 2023 15:31:51 +0200 Subject: [PATCH 1/5] add stackexchange code --- data_analysis/stackoverflow/h4_code/README.md | 26 + .../stackoverflow/h4_code/binarize.py | 117 +++ .../h4_code/stack_exchange_explore.py | 305 ++++++++ .../h4_code/stack_exchange_process.py | 718 ++++++++++++++++++ data_analysis/stackoverflow/other/main.py | 195 +++++ .../stackoverflow/other/requirements.txt | 5 + .../stackoverflow/other/se_reference_utils.py | 347 +++++++++ 7 files changed, 1713 insertions(+) create mode 100644 data_analysis/stackoverflow/h4_code/README.md create mode 100644 data_analysis/stackoverflow/h4_code/binarize.py create mode 100644 data_analysis/stackoverflow/h4_code/stack_exchange_explore.py create mode 100644 data_analysis/stackoverflow/h4_code/stack_exchange_process.py create mode 100644 data_analysis/stackoverflow/other/main.py create mode 100644 data_analysis/stackoverflow/other/requirements.txt create mode 100644 data_analysis/stackoverflow/other/se_reference_utils.py diff --git a/data_analysis/stackoverflow/h4_code/README.md b/data_analysis/stackoverflow/h4_code/README.md new file mode 100644 index 0000000..e918771 --- /dev/null +++ b/data_analysis/stackoverflow/h4_code/README.md @@ -0,0 +1,26 @@ +# Scripts for preference model pretraining data + +### Stack Exchange +Note: Stack Exchange Data Dump has a license requiring the addition of author's and links to the original material, see more [here](https://archive.org/details/stackexchange). + +1) `stack_exchange_explore.py`: example script for filtering stack exchange data to the question & answer format in Askell et al. 2021 on preference model pretraining (PMP). + +To run this code (from scratch including data download and faster processing), do the following: +Identify the raw data directory you're hoping to process, `ex_data_url`, and related data variables (further string optimizations can be added). 
+The script will pull raw data if you need it, uncompress it, and process the file to text. + +```shell +python scripts/data/pmp/stack_exchange_explore.py --stack_exchange=pets --save=True +``` + +2) `stack_exchange_process.py`: same as above, but designed to be run on a large machine to process all files consecutively. +It is a long for-loop over desired exchanges. + +```shell +python scripts/data/pmp/stack_exchange_process.py --save_path=/path/to/hf-dataset +``` + +3) `binarize.py`: used to binarize the pre-filter Stack Exchange data (and in the future, Reddit / Wikipedia) +```shell +python scripts/data/pmp/binarize.py --save_path=/path/to/hf-dataset +``` \ No newline at end of file diff --git a/data_analysis/stackoverflow/h4_code/binarize.py b/data_analysis/stackoverflow/h4_code/binarize.py new file mode 100644 index 0000000..79bcce8 --- /dev/null +++ b/data_analysis/stackoverflow/h4_code/binarize.py @@ -0,0 +1,117 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import random
from argparse import ArgumentParser
from pathlib import Path

import numpy as np


# Repo root; fall back to the working directory when the script lives at a
# shallower path (parents[3] would raise IndexError and break plain import).
try:
    H4_DIR = Path(__file__).resolve().parents[3]
except IndexError:
    H4_DIR = Path(".").resolve()
DATA_DIR = H4_DIR / "data"


def binarize_records(records, binary_score, data_len_limit=np.inf):
    """Convert scored Q/A records into binarized preference-pretraining rows.

    Args:
        records: iterable of dicts, each with a ``question`` string and an
            ``answers`` list of dicts carrying ``pm_score`` and ``text``.
        binary_score: placeholder score written on every emitted row; the
            preference signal lives in the pairing itself, not this number.
        data_len_limit: stop after consuming this many input records
            (used by ``--debug``).

    Returns:
        list of ``{"context": ..., "score": ...}`` rows, two per kept question.

    Questions whose answers do not span at least two distinct ``pm_score``
    values are skipped -- there is no preference signal to extract from them.
    """
    pmp_data = []
    for i, record in enumerate(records):
        # check debug limit, quit early if in debug mode
        if i > data_len_limit:
            print("Early exit for debug mode!")
            print(pmp_data)
            break

        question = record["question"]
        answers = record["answers"]

        answer_scores = [a["pm_score"] for a in answers]
        if len(np.unique(answer_scores)) < 2:
            print(f"PM Scores are {answer_scores}, skipping this question {i}")
            continue

        # Sample two answers with different scores. The loop terminates
        # because we verified above that >= 2 unique scores exist.
        while True:
            two_answers = random.sample(answers, 2)
            if two_answers[0]["pm_score"] != two_answers[1]["pm_score"]:
                break

        for answer in two_answers:
            context = "Question: " + question + "\n" + "Answer: " + answer["text"]
            pmp_data.append({"context": context, "score": binary_score})

    return pmp_data


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--debug", action="store_true", help="Added print statements / limit data size for debugging")
    parser.add_argument(
        "--output_dir",
        default=f"{DATA_DIR}/pmp-binarized",
        type=str,
        help="Where to save the processed dataset",
    )
    parser.add_argument(
        "--exchange_name",
        type=str,
        default=None,
        help="Optional argument to specify a specific subsection of the dataset",
    )
    parser.add_argument(
        "--binary_score", type=int, default=8, help="Score assigned to binarized pairs for preference data."
    )
    parser.add_argument(
        "--stream_data", action="store_true", help="Optionally stream data, which can be useful with weaker computers"
    )
    parser.set_defaults(debug=False, stream_data=False)  # default will process full dataset
    args = parser.parse_args()

    # Heavy third-party / project imports are deferred into the entry point so
    # binarize_records stays importable without the full H4 environment.
    from datasets import Dataset, concatenate_datasets, load_dataset

    from h4.data.utils import save_dataset_shards

    data_dir = "data/" + args.exchange_name if args.exchange_name else None
    data_len_limit = 10000 if args.debug else np.inf

    dataset = load_dataset(
        "HuggingFaceH4/pmp-stack-exchange",
        data_dir=data_dir,
        split="train",
        streaming=args.stream_data,
    )

    pmp_data = binarize_records(iter(dataset), args.binary_score, data_len_limit)

    # Save binarized data in bounded chunks to bypass a known Arrow issue:
    # https://issues.apache.org/jira/browse/ARROW-17137
    sublist_len = 100000
    print(f"Dataset length is {len(pmp_data)}")
    print(f"Processed dataset length > {sublist_len}, processing to HF dataset in chunks")
    chunks = [pmp_data[x : x + sublist_len] for x in range(0, len(pmp_data), sublist_len)]
    ds_chunks = [Dataset.from_list(ch) for ch in chunks]
    ds = concatenate_datasets(ds_chunks)

    save_dataset_shards(ds, args.output_dir, subset="stackexchange", shard_size="100MB")
# See the License for the specific language governing permissions and
# limitations under the License.
"""Filter one Stack Exchange dump into the question/answer preference format
of Askell et al. 2021 (preference model pretraining).

Downloads and unpacks <exchange>.stackexchange.com.7z if needed, then joins
Posts.xml with Users.xml (author attribution is required by the Stack
Exchange data-dump license) and prints or saves scored answer groups.
"""
import datetime
import json
import os
import time
from argparse import ArgumentParser

import numpy as np


try:
    # lxml is much faster when available; fall back to the stdlib parser.
    from lxml import etree as ET
except ImportError:
    import xml.etree.ElementTree as ET


def str2bool(value):
    """Parse a CLI boolean.

    argparse's ``type=bool`` treats ANY non-empty string (including "False")
    as True; this parser makes ``--debug=False`` actually mean False.
    """
    return str(value).lower() in {"1", "true", "yes", "y"}


def print_dict(d):
    """Debug helper: print one "key, value" line per entry."""
    for key, val in d.items():
        print(f"{key}, {val}")


def simplify_date(date_string):
    """Collapse a Stack Exchange ISO timestamp to "YYYY/MM/DD"."""
    date = datetime.datetime.strptime(date_string.split(".")[0], "%Y-%m-%dT%H:%M:%S")
    return date.strftime("%Y/%m/%d")


def pm_score(score, accepted):
    """Map a raw answer vote count to the PMP preference score.

    Non-negative scores are log-compressed, round(log2(1 + score)), with a
    +1 bonus for the accepted answer.  Negative scores collapse to -1; it is
    not documented whether a negative answer can be accepted, assumed no.
    """
    if score < 0:
        return -1
    s = round(np.log2(1 + score))
    return s + 1 if accepted else s


def ensure_raw_data(data_dir, se_name):
    """Download and unpack ``<se_name>.7z`` into ``data_dir/<se_name>/`` if
    Posts.xml is not already present."""
    ex_data_file = data_dir + se_name + "/Posts.xml"
    if os.path.exists(ex_data_file):
        return
    ex_data_file_7z = se_name + ".7z"
    if not os.path.exists(data_dir + ex_data_file_7z):
        print("Loading raw data, this can take a second!")
        import py7zr
        import requests

        ex_data_url = (
            "https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml/resolve/main/"
            + ex_data_file_7z
        )
        response = requests.get(ex_data_url, allow_redirects=True)
        filename = os.path.basename(ex_data_url)

        if response.status_code == 200:
            with open(data_dir + filename, "wb") as out:
                out.write(response.content)
            os.mkdir(data_dir + se_name)
            with py7zr.SevenZipFile(data_dir + filename, "r") as archive:
                archive.extractall(data_dir + se_name + "/")
        else:
            print("Request failed: %d" % response.status_code)

    print("Loaded data, now processing!")


def main():
    parser = ArgumentParser()
    parser.add_argument("--stack_exchange", default="ai", type=str, help="Which stack exchange data to process")
    parser.add_argument(
        "--save_to_text", default=False, type=str2bool, help="Whether or not the outputs are saved to a text file."
    )
    parser.add_argument("--debug", default=False, type=str2bool, help="Added print statements for debugging")
    args = parser.parse_args()

    save = args.save_to_text
    se_name = args.stack_exchange + ".stackexchange.com"
    debug = args.debug

    start_time = time.time()

    data_dir = "data/"
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)

    ensure_raw_data(data_dir, se_name)

    # XML file structure:
    # * PostTypeId ranges over 1: Question, 2: Answer, ...
    # * we keep questions with >= 2 answers; AcceptedAnswerId marks the winner
    # (docs https://meta.stackexchange.com/questions/2677/database-schema-documentation-for-the-public-data-dump-and-sede)
    local_path = data_dir + se_name + "/"

    # user id -> display name; needed for license-mandated attribution
    user_info = {-1: "(user-deleted)"}
    question_info = {}
    answer_info = {}

    with open(local_path + "Users.xml", "rb") as f:  # Users file
        tree = ET.parse(f)
        for row in tree.iter("row"):
            user_info[int(row.attrib["Id"])] = str(row.attrib["DisplayName"])

    if debug:
        print_dict(user_info)

    with open(local_path + "Posts.xml", "rb") as f:  # Posts file
        tree = ET.parse(f)

        # First pass: questions with at least two answers.
        for row in tree.iter("row"):
            if "AnswerCount" not in row.attrib:
                continue
            ans_count = int(row.attrib["AnswerCount"])
            if ans_count < 2:
                continue

            tag = int(row.attrib["Id"])
            # deleted users redirect to the community page
            user_id = int(row.attrib.get("OwnerUserId", -1))
            question_info[tag] = {
                "Body": row.attrib["Body"],
                "AnswerCount": ans_count,
                "PostScore": int(row.attrib["Score"]),
                "Author": user_id,
                "metadata": [
                    "https://" + se_name + "/questions/" + str(tag),
                    "https://" + se_name,
                    # no username after the trailing slash: names may contain spaces
                    "https://" + se_name + "/users/" + str(user_id) + "/",
                ],
                "Date": simplify_date(row.attrib["CreationDate"]),
                "AcceptedAnswerId": (
                    int(row.attrib["AcceptedAnswerId"]) if "AcceptedAnswerId" in row.attrib else None
                ),
            }
            if debug:
                print_dict(question_info[tag])

        # Second pass: answers (PostTypeId == 2) attached to kept questions.
        for row in tree.iter("row"):
            if int(row.attrib["PostTypeId"]) != 2:
                continue
            parent = int(row.attrib["ParentId"])
            if parent not in question_info:
                continue

            user_id = int(row.attrib.get("OwnerUserId", -1))
            info = answer_info.setdefault(
                parent, {"Text": [], "Score": [], "Id": [], "Author": [], "AuthorNames": []}
            )
            info["Text"].append(row.attrib["Body"])
            info["Score"].append(int(row.attrib["Score"]))
            # this id earns a score bonus later if it matches AcceptedAnswerId
            info["Id"].append(int(row.attrib["Id"]))
            info["Author"].append(user_id)
            # rare: an answer author can be missing from Users.xml
            info["AuthorNames"].append(user_info.get(user_id, "(user-not-found)"))
            if debug:
                print_dict(info)

    # don't save in debug mode
    if debug:
        return

    output_file = open(data_dir + "output.jsonl", "w") if save else None

    print(" ------ printing processed questions ------ ------ ------ ------ ------ ------ ")
    for k, question_data in question_info.items():
        if not save:
            print(" . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ")
            print(f"Question (id: {k}): {question_data['Body']}")

        if k not in answer_info:
            # AnswerCount can be stale (e.g. answers deleted after counting);
            # skip instead of raising KeyError
            continue

        accepted_ans = question_data["AcceptedAnswerId"]
        answer_data = answer_info[k]

        # filter for number of unique scores to be >= 2 (per the paper)
        scores = answer_data["Score"]
        if len(np.unique(scores)) < 2:
            continue

        answers = []
        for text, score, ans_id, auth_name, auth_id in zip(
            answer_data["Text"], scores, answer_data["Id"], answer_data["AuthorNames"], answer_data["Author"]
        ):
            accepted = accepted_ans == ans_id
            s = pm_score(score, accepted)
            answers.append(
                {
                    "AnswerID": ans_id,
                    "text": text,
                    "pm_score": s,
                    "selected": accepted,
                    "Author": auth_name,
                    "AuthorID": auth_id,
                    "AuthorProfile": "https://" + se_name + "/users/" + str(auth_id),
                }
            )
            if not save:
                # *** marks the accepted (preferred) answer
                pref = ", ***" if accepted else ""
                print(f"Answer (id {ans_id}, s:{s}{pref}): {text}")
                print(" . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ")

        if save:
            json.dump(
                {
                    "qid": k,
                    "question": question_data["Body"],
                    "answers": answers,
                    "date": question_data["Date"],
                    "metadata": question_data["metadata"],
                },
                output_file,
            )
            # one object per line -- without this the "jsonl" file was a
            # single run of concatenated JSON objects
            output_file.write("\n")

    if output_file is not None:
        output_file.close()
    print(f"finished at {time.time() - start_time}s")


# Developer notes kept from the original script (scaling / follow-ups):
_SCALING_NOTES = """
Added options/notes for scaling & changing this script

Adding a dataloader to use HuggingFace Datasets
`from datasets import load_dataset`
-----
Logs on loading 7z files -- example for the samsum dataset:
https://github.com/huggingface/datasets/blob/fedf891a08bfc77041d575fad6c26091bc0fce52/datasets/samsum/samsum.py#L106-L110
-----
Making a cleaner repo + dataloader out of the raw data here:
https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml/tree/main
* move many files into folder (how to do that without loading)?
* add data loader (see above, shouldn't be so hard)
* figure out storage datatype of the processed data
----
Maybe consider using Beautiful Soup?
https://www.crummy.com/software/BeautifulSoup/bs4/doc/

# list files in the raw repository
from huggingface_hub import HfApi
api = HfApi()
se_files = api.list_repo_files("flax-sentence-embeddings/stackexchange_xml", repo_type="dataset")
se_data_files = [f for f in se_files if "7z" in f]
se_names = [f[:f.find(".")] for f in se_files if "7z" in f]
se_names = [f + ".meta" if (i%2) == 0 else f for i, f in enumerate(se_names)]
"""


if __name__ == "__main__":
    main()
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Batch version of stack_exchange_explore.py: download, unpack and process
every Stack Exchange dump in ALL_EXCHANGES into the PMP question/answer
format, saving each exchange as sharded HF datasets.

Designed to run on a large machine; exchanges are processed consecutively.
"""
import datetime
import os
import time
from argparse import ArgumentParser
from pathlib import Path

import numpy as np


try:
    # lxml is much faster when available; fall back to the stdlib parser.
    from lxml import etree as ET
except ImportError:
    import xml.etree.ElementTree as ET


# Repo root; fall back to the working directory when the script lives at a
# shallower path (parents[3] would raise IndexError and break plain import).
try:
    H4_DIR = Path(__file__).resolve().parents[3]
except IndexError:
    H4_DIR = Path(".").resolve()
# TODO: Ideally we would use PosixPath here, but it doesn't work with the way the script is implemented :)
DATA_DIR = str(H4_DIR) + "/data/pmp-stack-exchange/"

# stack exchanges we filter (order matters: --start_idx indexes into it)
ALL_EXCHANGES = [
    "3dprinting.meta", "3dprinting", "academia.meta", "academia", "ai.meta", "ai",
    "android.meta", "android", "anime.meta", "anime", "apple.meta", "apple",
    "arduino.meta", "arduino", "askubuntu", "astronomy", "astronomy.meta",
    "aviation", "aviation.meta", "avp", "avp.meta", "beer", "beer.meta",
    "bicycles", "bicycles.meta", "bioinformatics", "bioinformatics.meta",
    "biology", "biology.meta", "bitcoin", "bitcoin.meta", "blender", "blender.meta",
    "boardgames", "boardgames.meta", "bricks", "bricks.meta", "buddhism", "buddhism.meta",
    "cardano", "cardano.meta", "chemistry", "chemistry.meta", "chess", "chess.meta",
    "chinese", "chinese.meta", "christianity", "christianity.meta", "civicrm", "civicrm.meta",
    "codegolf", "codegolf.meta", "codereview", "codereview.meta", "coffee", "coffee.meta",
    "cogsci", "cogsci.meta", "computergraphics", "computergraphics.meta",
    "conlang", "conlang.meta", "cooking", "cooking.meta", "craftcms", "craftcms.meta",
    "crafts", "crafts.meta", "crypto", "crypto.meta", "cs", "cs.meta",
    "cseducators", "cseducators.meta", "cstheory", "cstheory.meta",
    "datascience", "datascience.meta", "dba", "dba.meta", "devops", "devops.meta",
    "diy", "diy.meta", "drones", "drones.meta", "drupal", "drupal.meta",
    "dsp", "dsp.meta", "earthscience", "earthscience.meta", "ebooks", "ebooks.meta",
    "economics", "economics.meta", "electronics", "electronics.meta",
    "elementaryos", "elementaryos.meta", "ell", "ell.meta", "emacs", "emacs.meta",
    "engineering", "engineering.meta", "english", "english.meta", "eosio", "eosio.meta",
    "esperanto", "esperanto.meta", "ethereum", "ethereum.meta",
    "expatriates", "expatriates.meta", "expressionengine", "expressionengine.meta",
    "fitness", "fitness.meta", "freelancing", "freelancing.meta", "french", "french.meta",
    "gamedev", "gamedev.meta", "gaming", "gaming.meta", "gardening", "gardening.meta",
    "genealogy", "genealogy.meta", "german", "german.meta", "gis", "gis.meta",
    "graphicdesign", "graphicdesign.meta", "ham", "ham.meta",
    "hardwarerecs", "hardwarerecs.meta", "health", "health.meta",
    "hermeneutics", "hermeneutics.meta", "hinduism", "hinduism.meta",
    "history", "history.meta", "homebrew", "homebrew.meta", "hsm", "hsm.meta",
    "interpersonal", "interpersonal.meta", "iot", "iot.meta", "iota", "iota.meta",
    "islam", "islam.meta", "italian", "italian.meta", "japanese", "japanese.meta",
    "joomla", "joomla.meta", "judaism", "judaism.meta", "korean", "korean.meta",
    "languagelearning", "languagelearning.meta", "latin", "latin.meta",
    "law", "law.meta", "lifehacks", "lifehacks.meta", "linguistics", "linguistics.meta",
    "literature", "literature.meta", "magento", "magento.meta",
    "martialarts", "martialarts.meta", "materials", "materials.meta",
    "math", "math.meta", "matheducators", "matheducators.meta",
    "mathematica", "mathematica.meta", "mathoverflow", "mechanics.meta", "mechanics",
    "meta.askubuntu", "meta.mathoverflow", "meta.serverfault", "meta.stackexchange",
    "meta.stackoverflow", "meta.superuser", "moderators.meta", "moderators",
    "monero.meta", "monero", "money.meta", "money", "movies.meta", "movies",
    "music.meta", "music", "musicfans.meta", "musicfans", "mythology.meta", "mythology",
    "networkengineering.meta", "networkengineering", "opendata.meta", "opendata",
    "opensource.meta", "opensource", "or.meta", "or", "outdoors.meta", "outdoors",
    "parenting.meta", "parenting", "patents.meta", "patents", "pets.meta", "pets",
    "philosophy.meta", "philosophy", "photo.meta", "photo", "physics.meta", "physics",
    "pm.meta", "pm", "poker.meta", "poker", "politics.meta", "politics",
    "portuguese.meta", "portuguese", "puzzling.meta", "puzzling", "quant.meta", "quant",
    "quantumcomputing.meta", "quantumcomputing", "raspberrypi.meta", "raspberrypi",
    "retrocomputing.meta", "retrocomputing", "reverseengineering.meta", "reverseengineering",
    "robotics.meta", "robotics", "rpg.meta", "rpg", "rus.meta", "rus",
    "russian.meta", "russian", "salesforce.meta", "salesforce", "scicomp.meta", "scicomp",
    "scifi.meta", "scifi", "security.meta", "security", "serverfault",
    "sharepoint", "sharepoint.meta", "sitecore", "sitecore.meta",
    "skeptics", "skeptics.meta", "softwareengineering", "softwareengineering.meta",
    "softwarerecs", "softwarerecs.meta", "sound", "sound.meta", "space", "space.meta",
    "spanish", "spanish.meta", "sports", "sports.meta", "sqa", "sqa.meta",
    "stackapps", "stats.meta", "stats", "stellar.meta", "stellar", "superuser",
    "sustainability", "sustainability.meta", "tex", "tex.meta", "tezos", "tezos.meta",
    "tor", "tor.meta", "travel", "travel.meta", "tridion", "tridion.meta",
    "ukrainian", "ukrainian.meta", "unix", "unix.meta", "ux", "ux.meta",
    "vegetarianism", "vegetarianism.meta", "vi", "vi.meta", "webapps", "webapps.meta",
    "webmasters", "webmasters.meta", "windowsphone", "windowsphone.meta",
    "woodworking", "woodworking.meta", "wordpress", "wordpress.meta",
    "workplace", "workplace.meta", "worldbuilding", "worldbuilding.meta",
    "writers", "writers.meta",
    "Stackoverflow",  # hardcoded for different URL structure
]

# Some excluded stack exchanges below (not a maintained list)
# spanish: es.meta.stackoverflow.com.7z, es.stackoverflow.com.7z
# japanese: ja.meta.stackoverflow.com.7z, ja.stackoverflow.com.7z
# some language: pt.stackoverflow.com, pt.meta.stackoverflow.com
# ru.stackoverflow, ru.meta.stackoverflow

# stack exchanges with different processing, these end in .net ;(
DOTNET_LIST = ["mathoverflow", "meta.mathoverflow"]

# stack exchanges without .stackexchange.com (includes above)
SHORT_URL_LIST = [
    "askubuntu",
    "meta.askubuntu",
    "meta.serverfault",
    "meta.stackexchange",
    "meta.stackoverflow",
    "stackexchange",
    "superuser",
    "meta.superuser",
    "serverfault",
    "stackapps",
    "Stackoverflow",
]
SHORT_URL_LIST += DOTNET_LIST


def get_and_unpack_7z(directory: str, data_save_dir: str, save_dir_override: str = None):
    """Download ``directory`` (a path/name ending in .7z) from archive.org's
    stackexchange collection into ``data_save_dir`` and extract it.

    Args:
        directory: archive name (or path whose basename is taken), must end in ".7z".
        data_save_dir: local directory the archive is written and extracted into.
            (The original mixed this parameter with the DATA_DIR global --
            checked one path, wrote another; now used consistently.)
        save_dir_override: extract into this subdirectory instead of the archive's
            own name (used to merge the Stackoverflow Posts/Users archives).
    """
    # network/unpack deps only needed when actually downloading
    import py7zr
    import requests

    se_name_7z = directory[directory.rfind("/") + 1 :]
    se_name = se_name_7z[:-3]
    assert ".7z" == se_name_7z[-3:]

    # check if the archive already exists (no need to re-download):
    if os.path.exists(data_save_dir + se_name_7z):
        print("Raw 7z data already exists for this dir :)")
        return

    print("Loading raw data, this can take a second!")
    ex_data_url = "https://archive.org/download/stackexchange/" + se_name_7z
    response = requests.get(ex_data_url, allow_redirects=True)
    filename = os.path.basename(ex_data_url)

    print("Unpacking raw data.")
    if response.status_code == 200:
        with open(data_save_dir + filename, "wb") as out:
            out.write(response.content)
        os.mkdir(data_save_dir + se_name)
        with py7zr.SevenZipFile(data_save_dir + filename, "r") as archive:
            save_dir = save_dir_override if save_dir_override else se_name
            archive.extractall(data_save_dir + save_dir + "/")
    else:
        print("Request failed: %d" % response.status_code)

    print("Loaded & unpacked data, now processing...")


def print_dict(d):
    """Debug helper: print one "key, value" line per entry."""
    for key, val in d.items():
        print(f"{key}, {val}")


def simplify_date(date_string):
    """Collapse a Stack Exchange ISO timestamp to "YYYY/MM/DD"."""
    date = datetime.datetime.strptime(date_string.split(".")[0], "%Y-%m-%dT%H:%M:%S")
    return date.strftime("%Y/%m/%d")


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument(
        "--all",
        action="store_true",
        help="If the script will process all stack exchanges: warning, requires large amount of RAM",
    )
    parser.add_argument("--save_path", default=DATA_DIR, type=str, help="Path to the huggingface dataset preferably.")
    parser.add_argument(
        "--start_idx",
        default=0,
        type=int,
        help="Optional value to skip a number of exchanges in the above list if processing crashed midway",
    )
    parser.add_argument("--shard_size", default=100, type=int, help="Maximum size of file for subsets of data in MB")
    parser.add_argument("--debug", action="store_true", help="Added print statements for debugging")
    parser.set_defaults(debug=False, all=False)

    args = parser.parse_args()

    # Heavy third-party / project imports deferred so the module stays
    # importable without the full H4 environment.
    from datasets import Dataset, concatenate_datasets

    from h4.data.utils import save_dataset_shards

    shard_size = str(args.shard_size) + "MB"
    process_all = args.all
    save_path = args.save_path
    start_idx = args.start_idx
    DEBUG = args.debug
    if process_all:
        se_list = ALL_EXCHANGES
    else:
        # --all is a store_true flag, so the correct invocation is plain "--all"
        print("Run from command line with --all to process all data")
        se_list = ["ai", "apple", "pets", "ai.meta"]

    os.makedirs(DATA_DIR, exist_ok=True)

    # Process all exchanges in a loop (accumulates each section in memory)
    TOTAL = len(se_list) - 1
    for i, se_sub_name in enumerate(se_list[start_idx:]):
        print(f"SECTION {i + start_idx}/{TOTAL}: {se_sub_name} - START")

        # some stack exchanges don't use .stackexchange.com
        if se_sub_name not in SHORT_URL_LIST:
            se_full_name = se_sub_name + ".stackexchange.com"
        elif se_sub_name in DOTNET_LIST:  # two exchanges need .net
            se_full_name = se_sub_name + ".net"
        else:
            se_full_name = se_sub_name + ".com"

        start_time = time.time()
        full_section_data = []

        # Dumps live at e.g.
        #   https://archive.org/download/stackexchange/Stackoverflow.com-Posts.7z
        #   https://archive.org/download/stackexchange/Stackoverflow.com-Users.7z
        # Archives extract under the LOWERCASED exchange name, so the
        # existence check must use the lowercased path too (otherwise the
        # "Stackoverflow.com" case re-downloads on every run).
        ex_data_file = DATA_DIR + se_full_name.lower() + "/Users.xml"
        if not os.path.exists(ex_data_file):
            ex_data_file_7z = se_full_name + ".7z"
            if "Stackoverflow.com" in ex_data_file_7z:
                # Stack Overflow ships per-table archives; merge Posts + Users
                base_stackoverflow_dir = ex_data_file_7z[:-3]
                get_and_unpack_7z(
                    base_stackoverflow_dir + "-Posts.7z", DATA_DIR, save_dir_override="stackoverflow.com"
                )
                get_and_unpack_7z(
                    base_stackoverflow_dir.lower() + "-Users.7z", DATA_DIR, save_dir_override="stackoverflow.com"
                )  # users archive name only is lowercase s
            else:
                get_and_unpack_7z(ex_data_file_7z, DATA_DIR)

        # load extracted xml files (lowercased dir, see above)
        local_path = DATA_DIR + se_full_name.lower() + "/"
        posts_subpath = "Posts.xml"
        users_subpath = "Users.xml"

        # XML file structure:
        # * PostTypeId ranges over 1: Question, 2: Answer, ...
        # * we keep questions with >= 2 answers; AcceptedAnswerId marks the winner
        # (docs https://meta.stackexchange.com/questions/2677/database-schema-documentation-for-the-public-data-dump-and-sede)

        # user id -> display name; needed for license-mandated attribution
        user_info = {-1: "(user-deleted)"}
        question_info = {}
        answer_info = {}

        with open(local_path + users_subpath, "rb") as f:  # Users file
            tree = ET.parse(f)
            for row in tree.iter("row"):
                user_info[int(row.attrib["Id"])] = str(row.attrib["DisplayName"])

        if DEBUG:
            print_dict(user_info)

        with open(local_path + posts_subpath, "rb") as f:  # Posts file
            tree = ET.parse(f)

            # First pass: questions with at least two answers.
            for row in tree.iter("row"):
                if "AnswerCount" not in row.attrib:
                    continue
                ans_count = int(row.attrib["AnswerCount"])
                if ans_count < 2:
                    continue

                tag = int(row.attrib["Id"])
                # deleted users redirect to the community page
                user_id = int(row.attrib.get("OwnerUserId", -1))
                question_info[tag] = {
                    "Body": row.attrib["Body"],
                    "AnswerCount": ans_count,
                    "PostScore": int(row.attrib["Score"]),
                    "Author": user_id,
                    "metadata": [
                        "https://" + se_full_name + "/questions/" + str(tag),  # question URL
                        "https://" + se_full_name,  # Exchange URL
                        # Author URL -- no username after the slash: names may contain spaces
                        "https://" + se_full_name + "/users/" + str(user_id) + "/",
                    ],
                    "Date": simplify_date(row.attrib["CreationDate"]),
                    "AcceptedAnswerId": (
                        int(row.attrib["AcceptedAnswerId"]) if "AcceptedAnswerId" in row.attrib else None
                    ),
                }
                if DEBUG:
                    print_dict(question_info[tag])

            # Second pass: answers (PostTypeId == 2) attached to kept questions.
            for row in tree.iter("row"):
                if int(row.attrib["PostTypeId"]) != 2:
                    continue
                parent = int(row.attrib["ParentId"])
                if parent not in question_info:
                    continue

                user_id = int(row.attrib.get("OwnerUserId", -1))
                info = answer_info.setdefault(
                    parent, {"Text": [], "Score": [], "Id": [], "Author": [], "AuthorNames": []}
                )
                info["Text"].append(row.attrib["Body"])
                info["Score"].append(int(row.attrib["Score"]))
                # this id earns a score bonus later if it matches AcceptedAnswerId
                info["Id"].append(int(row.attrib["Id"]))
                info["Author"].append(user_id)
                # rare case: the username for an answer author is not in the database
                info["AuthorNames"].append(user_info.get(user_id, "(user-not-found)"))
                if DEBUG:
                    print_dict(info)

        for k, question_data in question_info.items():
            accepted_ans = question_data["AcceptedAnswerId"]

            if k not in answer_info:
                # AnswerCount can be stale (e.g. answers deleted after
                # counting); skip instead of raising KeyError
                continue
            answer_data = answer_info[k]

            # filter for number of unique scores to be >= 2 (per the paper)
            scores = answer_data["Score"]
            if len(np.unique(scores)) < 2:
                continue

            answers = []
            for text, score, ans_id, auth_name, auth_id in zip(
                answer_data["Text"], scores, answer_data["Id"], answer_data["AuthorNames"], answer_data["Author"]
            ):
                accepted = accepted_ans == ans_id
                if score >= 0:
                    # log-compress non-negative vote counts; +1 bonus if accepted
                    # (not documented if negative answers can be accepted, assuming no)
                    s = round(np.log2(1 + score))
                    if accepted:
                        s += 1
                else:
                    s = -1

                answers.append(
                    {
                        "answer_id": ans_id,
                        "text": text,
                        "pm_score": s,
                        "selected": accepted,
                        "author": auth_name,
                        "author_id": auth_id,
                        "author_profile": "https://" + se_full_name + "/users/" + str(auth_id),
                    }
                )

            full_section_data.append(
                {
                    "qid": k,
                    "question": question_data["Body"],
                    "answers": answers,
                    "date": question_data["Date"],
                    "metadata": question_data["metadata"],
                }
            )

        print(f"finished section {se_full_name} at {time.time() - start_time}s")

        if not DEBUG:
            sublist_len = 100000

            # bypass known issue in arrow https://issues.apache.org/jira/browse/ARROW-17137
            if len(full_section_data) > sublist_len:
                print(f"Processed dataset length > {sublist_len}, processing to HF dataset in chunks")
                chunks = [
                    full_section_data[x : x + sublist_len] for x in range(0, len(full_section_data), sublist_len)
                ]
                ds = concatenate_datasets([Dataset.from_list(ch) for ch in chunks])
            else:
                ds = Dataset.from_list(full_section_data)

            save_dataset_shards(ds, save_path, subset=se_full_name, shard_size=shard_size)
https://github.com/huggingface/h4/blob/main/scripts/data/pmp/stack_exchange_process.py +import datetime +import os +import time +import xml.etree.ElementTree as ET +from collections import defaultdict + +from datasets import Dataset, concatenate_datasets +from tqdm import tqdm + +# Note: Using rclone + py7zr in command line is often faster than this +import py7zr +import requests + +# If the cleaning becomes a bottleneck at some point, could be better to use +# this snippet from Anton https://gist.github.com/anton-l/4bfafb42878a8e77b20f3b844d9cae36 +# (uses selectolax, faster than bs4) instead. +from bs4 import BeautifulSoup +from se_reference_utils import ALL_EXCHANGES + + +DATA_DIR = "data/stack-exchange" +WTOKEN = os.getenv("WTOKEN") + + +def simplify_date(date_string): + date = datetime.datetime.strptime(date_string.split(".")[0], "%Y-%m-%dT%H:%M:%S") + return date.strftime("%Y/%m/%d") + + +def download_and_extract_se7z(name: str, directory: str, data_save_dir: str, save_dir_override: str = None): + # Downloading 7z file + if os.path.exists(f"{data_save_dir}/{name}.7z"): + print("Raw 7z data already exists for this dir.") + else: + print("Downloading compressed data.") + + ex_data_url = f"https://archive.org/download/stackexchange/{directory}" + response = requests.get(ex_data_url, allow_redirects=True) + + if response.status_code != 200: + raise ConnectionError(f"Request failed: {response.status_code} for subset: {name}, url: {ex_data_url}") + + print("Unpacking raw data.") + with open(f"{DATA_DIR}/{name}.7z", "wb") as out: + out.write(response.content) + + os.mkdir(f"{DATA_DIR}/{name}") + with py7zr.SevenZipFile(f"{DATA_DIR}/{name}.7z", "r") as archive: + save_dir = save_dir_override if save_dir_override is not None else name + archive.extractall(f"{DATA_DIR}/{save_dir}/") + + print(f"{name} successfully extracted.") + + +def get_question_from_html(exchange): + question = {} + keys_of_interest = ["Id", "Body", "AnswerCount", "OwnerUserId", "PostScore", 
"Date", "AcceptedAnswerId"] + for key in keys_of_interest: + try: + if key in ["Id", "AnswerCount", "PostScore", "AcceptedAnswerId", "OwnerUserId"]: + question[key] = int(exchange.attrib[key]) + elif key == "Date": + question[key] = simplify_date(exchange.attrib["CreationDate"]) + elif key == "Body": + question[key] = exchange.attrib[key] + question["text"] = BeautifulSoup(exchange.attrib[key], "lxml").text + else: + question[key] = exchange.attrib[key] + except KeyError: + # deleted user redirect to community page > -1 + question[key] = -1 if key == "OwnerUserId" else None + + question["metadata"] = [ + f"https://{se_sub_url}/questions/{str(question['Id'])}", # question URL + f"https://{se_sub_url}", # Exchange URL + f"https://{se_sub_url}/users/{str(question['OwnerUserId'])}/", # Author URL + ] + + return question["Id"], question + + +def get_answer_from_html(exchange): + # We connect answers to their parent's id + parent_id = int(exchange.attrib["ParentId"]) + + answer = {} + keys_of_interest = ["Body", "Score", "Id", "OwnerUserId"] + for key in keys_of_interest: + try: + if key in ["Score", "Id", "OwnerUserId"]: + answer[key] = int(exchange.attrib[key]) + elif key == "Body": + answer[key] = exchange.attrib[key] + answer["text"] = BeautifulSoup(exchange.attrib[key], "lxml").text + else: + answer[key] = exchange.attrib[key] + except KeyError: + answer[key] = -1 if key == "OwnerUserId" else None + + return parent_id, answer + + +def get_posts_from_html(se_sub_name): + extracted_info = defaultdict(lambda: {"question": None, "answers": list()}) + with open(f"{DATA_DIR}/{se_sub_name}/Posts.xml", "rb") as f: + tree = ET.parse(f) + + for exchange in tree.iter("row"): + post_type = int(exchange.attrib["PostTypeId"]) + + if post_type == 1: # Question + if int(exchange.attrib["AnswerCount"]) > 0: + tag, question = get_question_from_html(exchange) + extracted_info[tag]["question"] = question + + elif post_type == 2: # Answer + tag, answer = get_answer_from_html(exchange) + 
extracted_info[tag]["answers"].append(answer) + return extracted_info + + +def get_jsonlines_from_posts(extracted_info): + result_jsonlines = [] + for tag, data in extracted_info.items(): + # Sorting answers by score (see LLAMA paper), and only keep positively scored ones + question = data["question"] + answers = [a for a in sorted(data["answers"], key=lambda x: x["Score"]) if a["Score"] > 0] + + # We skip empty questions or answers + if question is None or len(answers) < 1: + continue + + text = f"user{question['OwnerUserId']}: {question['text']}" + for answer in answers: + text += f"\nuser{answer['OwnerUserId']}: {answer['text']}" + + result = { + "question_id": question["Id"], + "text": text, + "metadata": question["metadata"], + "date": question["Date"], + "original_text": [f"{item['OwnerUserId']}: {item['Body']}" for item in [question] + answers], + } + result_jsonlines.append(result) + return result_jsonlines + + +def upload_to_hub(result_jsonlines): + size = len(result_jsonlines) + chunk_size = 100000 + if size > chunk_size: + chunks = [ + Dataset.from_list(result_jsonlines[i : min(i + chunk_size, size)]) for i in range(0, size, chunk_size) + ] + dataset = concatenate_datasets(chunks) + else: + dataset = Dataset.from_list(result_jsonlines) + + dataset.push_to_hub("HuggingFaceGECLM/StackExchange_Mar2023", split=se_sub_name, private=True, token=WTOKEN) + + +def main(se_sub_name, se_sub_url): + print(f"{se_sub_name} at {se_sub_url}.") + start_time = time.time() + + # Download and extract + if not os.path.exists(f"{DATA_DIR}/{se_sub_name}/Posts.xml"): + if "se_sub_name" == "stackoverflow": + # Note: we'll also need -Users.7z if we want to filter on licenses at some point + download_and_extract_se7z( + se_sub_name, f"{se_sub_url}-Posts.7z", DATA_DIR, save_dir_override="stackoverflow.com" + ) + else: + download_and_extract_se7z(se_sub_name, f"{se_sub_url}.7z", DATA_DIR) + + # Selects posts from HTML tree (Questions and answers) + extracted_info = 
get_posts_from_html(se_sub_name) + print("Posts parsed from HTML.") + + # Create json from posts + result_jsonlines = get_jsonlines_from_posts(extracted_info) + + print(f"Finished {se_sub_url} in {time.time() - start_time}s. Contains {len(result_jsonlines)} lines.") + + # Saves to the hub + upload_to_hub(result_jsonlines) + + +if __name__ == "__main__": + os.makedirs(DATA_DIR, exist_ok=True) + + # Process all exchanges in a loop - could be easily launched in parallel + for se_sub_name, se_sub_url in tqdm(ALL_EXCHANGES.items()): + main(se_sub_name, se_sub_url) diff --git a/data_analysis/stackoverflow/other/requirements.txt b/data_analysis/stackoverflow/other/requirements.txt new file mode 100644 index 0000000..80efabe --- /dev/null +++ b/data_analysis/stackoverflow/other/requirements.txt @@ -0,0 +1,5 @@ +datasets +py7zr +requests +tqdm +bs4 \ No newline at end of file diff --git a/data_analysis/stackoverflow/other/se_reference_utils.py b/data_analysis/stackoverflow/other/se_reference_utils.py new file mode 100644 index 0000000..a9cc434 --- /dev/null +++ b/data_analysis/stackoverflow/other/se_reference_utils.py @@ -0,0 +1,347 @@ +ALL_EXCHANGES = { + "3dprinting.meta": "3dprinting.meta.stackexchange.com", + "3dprinting": "3dprinting.stackexchange.com", + "academia.meta": "academia.meta.stackexchange.com", + "academia": "academia.stackexchange.com", + "ai.meta": "ai.meta.stackexchange.com", + "ai": "ai.stackexchange.com", + "android.meta": "android.meta.stackexchange.com", + "android": "android.stackexchange.com", + "anime.meta": "anime.meta.stackexchange.com", + "anime": "anime.stackexchange.com", + "apple.meta": "apple.meta.stackexchange.com", + "apple": "apple.stackexchange.com", + "arduino.meta": "arduino.meta.stackexchange.com", + "arduino": "arduino.stackexchange.com", + "askubuntu": "askubuntu.com", + "astronomy": "astronomy.stackexchange.com", + "astronomy.meta": "astronomy.meta.stackexchange.com", + "aviation": "aviation.stackexchange.com", + "aviation.meta": 
"aviation.meta.stackexchange.com", + "avp": "avp.stackexchange.com", + "avp.meta": "avp.meta.stackexchange.com", + "beer": "beer.stackexchange.com", + "beer.meta": "beer.meta.stackexchange.com", + "bicycles": "bicycles.stackexchange.com", + "bicycles.meta": "bicycles.meta.stackexchange.com", + "bioinformatics": "bioinformatics.stackexchange.com", + "bioinformatics.meta": "bioinformatics.meta.stackexchange.com", + "biology": "biology.stackexchange.com", + "biology.meta": "biology.meta.stackexchange.com", + "bitcoin": "bitcoin.stackexchange.com", + "bitcoin.meta": "bitcoin.meta.stackexchange.com", + "blender": "blender.stackexchange.com", + "blender.meta": "blender.meta.stackexchange.com", + "boardgames": "boardgames.stackexchange.com", + "boardgames.meta": "boardgames.meta.stackexchange.com", + "bricks": "bricks.stackexchange.com", + "bricks.meta": "bricks.meta.stackexchange.com", + "buddhism": "buddhism.stackexchange.com", + "buddhism.meta": "buddhism.meta.stackexchange.com", + "cardano": "cardano.stackexchange.com", + "cardano.meta": "cardano.meta.stackexchange.com", + "chemistry": "chemistry.stackexchange.com", + "chemistry.meta": "chemistry.meta.stackexchange.com", + "chess": "chess.stackexchange.com", + "chess.meta": "chess.meta.stackexchange.com", + "chinese": "chinese.stackexchange.com", + "chinese.meta": "chinese.meta.stackexchange.com", + "christianity": "christianity.stackexchange.com", + "christianity.meta": "christianity.meta.stackexchange.com", + "civicrm": "civicrm.stackexchange.com", + "civicrm.meta": "civicrm.meta.stackexchange.com", + "codegolf": "codegolf.stackexchange.com", + "codegolf.meta": "codegolf.meta.stackexchange.com", + "codereview": "codereview.stackexchange.com", + "codereview.meta": "codereview.meta.stackexchange.com", + "coffee": "coffee.stackexchange.com", + "coffee.meta": "coffee.meta.stackexchange.com", + "cogsci": "cogsci.stackexchange.com", + "cogsci.meta": "cogsci.meta.stackexchange.com", + "computergraphics": 
"computergraphics.stackexchange.com", + "computergraphics.meta": "computergraphics.meta.stackexchange.com", + "conlang": "conlang.stackexchange.com", + "conlang.meta": "conlang.meta.stackexchange.com", + "cooking": "cooking.stackexchange.com", + "cooking.meta": "cooking.meta.stackexchange.com", + "craftcms": "craftcms.stackexchange.com", + "craftcms.meta": "craftcms.meta.stackexchange.com", + "crafts": "crafts.stackexchange.com", + "crafts.meta": "crafts.meta.stackexchange.com", + "crypto": "crypto.stackexchange.com", + "crypto.meta": "crypto.meta.stackexchange.com", + "cs": "cs.stackexchange.com", + "cs.meta": "cs.meta.stackexchange.com", + "cseducators": "cseducators.stackexchange.com", + "cseducators.meta": "cseducators.meta.stackexchange.com", + "cstheory": "cstheory.stackexchange.com", + "cstheory.meta": "cstheory.meta.stackexchange.com", + "datascience": "datascience.stackexchange.com", + "datascience.meta": "datascience.meta.stackexchange.com", + "dba": "dba.stackexchange.com", + "dba.meta": "dba.meta.stackexchange.com", + "devops": "devops.stackexchange.com", + "devops.meta": "devops.meta.stackexchange.com", + "diy": "diy.stackexchange.com", + "diy.meta": "diy.meta.stackexchange.com", + "drones": "drones.stackexchange.com", + "drones.meta": "drones.meta.stackexchange.com", + "drupal": "drupal.stackexchange.com", + "drupal.meta": "drupal.meta.stackexchange.com", + "dsp": "dsp.stackexchange.com", + "dsp.meta": "dsp.meta.stackexchange.com", + "earthscience": "earthscience.stackexchange.com", + "earthscience.meta": "earthscience.meta.stackexchange.com", + "ebooks": "ebooks.stackexchange.com", + "ebooks.meta": "ebooks.meta.stackexchange.com", + "economics": "economics.stackexchange.com", + "economics.meta": "economics.meta.stackexchange.com", + "electronics": "electronics.stackexchange.com", + "electronics.meta": "electronics.meta.stackexchange.com", + "elementaryos": "elementaryos.stackexchange.com", + "elementaryos.meta": "elementaryos.meta.stackexchange.com", 
+ "ell": "ell.stackexchange.com", + "ell.meta": "ell.meta.stackexchange.com", + "emacs": "emacs.stackexchange.com", + "emacs.meta": "emacs.meta.stackexchange.com", + "engineering": "engineering.stackexchange.com", + "engineering.meta": "engineering.meta.stackexchange.com", + "english": "english.stackexchange.com", + "english.meta": "english.meta.stackexchange.com", + "eosio": "eosio.stackexchange.com", + "eosio.meta": "eosio.meta.stackexchange.com", + "esperanto": "esperanto.stackexchange.com", + "esperanto.meta": "esperanto.meta.stackexchange.com", + "ethereum": "ethereum.stackexchange.com", + "ethereum.meta": "ethereum.meta.stackexchange.com", + "expatriates": "expatriates.stackexchange.com", + "expatriates.meta": "expatriates.meta.stackexchange.com", + "expressionengine": "expressionengine.stackexchange.com", + "expressionengine.meta": "expressionengine.meta.stackexchange.com", + "fitness": "fitness.stackexchange.com", + "fitness.meta": "fitness.meta.stackexchange.com", + "freelancing": "freelancing.stackexchange.com", + "freelancing.meta": "freelancing.meta.stackexchange.com", + "french": "french.stackexchange.com", + "french.meta": "french.meta.stackexchange.com", + "gamedev": "gamedev.stackexchange.com", + "gamedev.meta": "gamedev.meta.stackexchange.com", + "gaming": "gaming.stackexchange.com", + "gaming.meta": "gaming.meta.stackexchange.com", + "gardening": "gardening.stackexchange.com", + "gardening.meta": "gardening.meta.stackexchange.com", + "genealogy": "genealogy.stackexchange.com", + "genealogy.meta": "genealogy.meta.stackexchange.com", + "german": "german.stackexchange.com", + "german.meta": "german.meta.stackexchange.com", + "gis": "gis.stackexchange.com", + "gis.meta": "gis.meta.stackexchange.com", + "graphicdesign": "graphicdesign.stackexchange.com", + "graphicdesign.meta": "graphicdesign.meta.stackexchange.com", + "ham": "ham.stackexchange.com", + "ham.meta": "ham.meta.stackexchange.com", + "hardwarerecs": "hardwarerecs.stackexchange.com", + 
"hardwarerecs.meta": "hardwarerecs.meta.stackexchange.com", + "health": "health.stackexchange.com", + "health.meta": "health.meta.stackexchange.com", + "hermeneutics": "hermeneutics.stackexchange.com", + "hermeneutics.meta": "hermeneutics.meta.stackexchange.com", + "hinduism": "hinduism.stackexchange.com", + "hinduism.meta": "hinduism.meta.stackexchange.com", + "history": "history.stackexchange.com", + "history.meta": "history.meta.stackexchange.com", + "homebrew": "homebrew.stackexchange.com", + "homebrew.meta": "homebrew.meta.stackexchange.com", + "hsm": "hsm.stackexchange.com", + "hsm.meta": "hsm.meta.stackexchange.com", + "interpersonal": "interpersonal.stackexchange.com", + "interpersonal.meta": "interpersonal.meta.stackexchange.com", + "iot": "iot.stackexchange.com", + "iot.meta": "iot.meta.stackexchange.com", + "iota": "iota.stackexchange.com", + "iota.meta": "iota.meta.stackexchange.com", + "islam": "islam.stackexchange.com", + "islam.meta": "islam.meta.stackexchange.com", + "italian": "italian.stackexchange.com", + "italian.meta": "italian.meta.stackexchange.com", + "japanese": "japanese.stackexchange.com", + "japanese.meta": "japanese.meta.stackexchange.com", + "joomla": "joomla.stackexchange.com", + "joomla.meta": "joomla.meta.stackexchange.com", + "judaism": "judaism.stackexchange.com", + "judaism.meta": "judaism.meta.stackexchange.com", + "korean": "korean.stackexchange.com", + "korean.meta": "korean.meta.stackexchange.com", + "languagelearning": "languagelearning.stackexchange.com", + "languagelearning.meta": "languagelearning.meta.stackexchange.com", + "latin": "latin.stackexchange.com", + "latin.meta": "latin.meta.stackexchange.com", + "law": "law.stackexchange.com", + "law.meta": "law.meta.stackexchange.com", + "lifehacks": "lifehacks.stackexchange.com", + "lifehacks.meta": "lifehacks.meta.stackexchange.com", + "linguistics": "linguistics.stackexchange.com", + "linguistics.meta": "linguistics.meta.stackexchange.com", + "literature": 
"literature.stackexchange.com", + "literature.meta": "literature.meta.stackexchange.com", + "magento": "magento.stackexchange.com", + "magento.meta": "magento.meta.stackexchange.com", + "martialarts": "martialarts.stackexchange.com", + "martialarts.meta": "martialarts.meta.stackexchange.com", + "materials": "materials.stackexchange.com", + "materials.meta": "materials.meta.stackexchange.com", + "math": "math.stackexchange.com", + "math.meta": "math.meta.stackexchange.com", + "matheducators": "matheducators.stackexchange.com", + "matheducators.meta": "matheducators.meta.stackexchange.com", + "mathematica": "mathematica.stackexchange.com", + "mathematica.meta": "mathematica.meta.stackexchange.com", + "mathoverflow": "mathoverflow.net", + "mechanics.meta": "mechanics.meta.stackexchange.com", + "mechanics": "mechanics.stackexchange.com", + "meta.askubuntu": "meta.askubuntu.com", + "meta.mathoverflow": "meta.mathoverflow.net", + "meta.serverfault": "meta.serverfault.com", + "meta.stackexchange": "meta.stackexchange.com", + "meta.stackoverflow": "meta.stackoverflow.com", + "meta.superuser": "meta.superuser.com", + "moderators.meta": "moderators.meta.stackexchange.com", + "moderators": "moderators.stackexchange.com", + "monero.meta": "monero.meta.stackexchange.com", + "monero": "monero.stackexchange.com", + "money.meta": "money.meta.stackexchange.com", + "money": "money.stackexchange.com", + "movies.meta": "movies.meta.stackexchange.com", + "movies": "movies.stackexchange.com", + "music.meta": "music.meta.stackexchange.com", + "music": "music.stackexchange.com", + "musicfans.meta": "musicfans.meta.stackexchange.com", + "musicfans": "musicfans.stackexchange.com", + "mythology.meta": "mythology.meta.stackexchange.com", + "mythology": "mythology.stackexchange.com", + "networkengineering.meta": "networkengineering.meta.stackexchange.com", + "networkengineering": "networkengineering.stackexchange.com", + "opendata.meta": "opendata.meta.stackexchange.com", + "opendata": 
"opendata.stackexchange.com", + "opensource.meta": "opensource.meta.stackexchange.com", + "opensource": "opensource.stackexchange.com", + "or.meta": "or.meta.stackexchange.com", + "or": "or.stackexchange.com", + "outdoors.meta": "outdoors.meta.stackexchange.com", + "outdoors": "outdoors.stackexchange.com", + "parenting.meta": "parenting.meta.stackexchange.com", + "parenting": "parenting.stackexchange.com", + "patents.meta": "patents.meta.stackexchange.com", + "patents": "patents.stackexchange.com", + "pets.meta": "pets.meta.stackexchange.com", + "pets": "pets.stackexchange.com", + "philosophy.meta": "philosophy.meta.stackexchange.com", + "philosophy": "philosophy.stackexchange.com", + "photo.meta": "photo.meta.stackexchange.com", + "photo": "photo.stackexchange.com", + "physics.meta": "physics.meta.stackexchange.com", + "physics": "physics.stackexchange.com", + "pm.meta": "pm.meta.stackexchange.com", + "pm": "pm.stackexchange.com", + "poker.meta": "poker.meta.stackexchange.com", + "poker": "poker.stackexchange.com", + "politics.meta": "politics.meta.stackexchange.com", + "politics": "politics.stackexchange.com", + "portuguese.meta": "portuguese.meta.stackexchange.com", + "portuguese": "portuguese.stackexchange.com", + "puzzling.meta": "puzzling.meta.stackexchange.com", + "puzzling": "puzzling.stackexchange.com", + "quant.meta": "quant.meta.stackexchange.com", + "quant": "quant.stackexchange.com", + "quantumcomputing.meta": "quantumcomputing.meta.stackexchange.com", + "quantumcomputing": "quantumcomputing.stackexchange.com", + "raspberrypi.meta": "raspberrypi.meta.stackexchange.com", + "raspberrypi": "raspberrypi.stackexchange.com", + "retrocomputing.meta": "retrocomputing.meta.stackexchange.com", + "retrocomputing": "retrocomputing.stackexchange.com", + "reverseengineering.meta": "reverseengineering.meta.stackexchange.com", + "reverseengineering": "reverseengineering.stackexchange.com", + "robotics.meta": "robotics.meta.stackexchange.com", + "robotics": 
"robotics.stackexchange.com", + "rpg.meta": "rpg.meta.stackexchange.com", + "rpg": "rpg.stackexchange.com", + "rus.meta": "rus.meta.stackexchange.com", + "rus": "rus.stackexchange.com", + "russian.meta": "russian.meta.stackexchange.com", + "russian": "russian.stackexchange.com", + "salesforce.meta": "salesforce.meta.stackexchange.com", + "salesforce": "salesforce.stackexchange.com", + "scicomp.meta": "scicomp.meta.stackexchange.com", + "scicomp": "scicomp.stackexchange.com", + "scifi.meta": "scifi.meta.stackexchange.com", + "scifi": "scifi.stackexchange.com", + "security.meta": "security.meta.stackexchange.com", + "security": "security.stackexchange.com", + "serverfault": "serverfault.com", + "sharepoint": "sharepoint.stackexchange.com", + "sharepoint.meta": "sharepoint.meta.stackexchange.com", + "sitecore": "sitecore.stackexchange.com", + "sitecore.meta": "sitecore.meta.stackexchange.com", + "skeptics": "skeptics.stackexchange.com", + "skeptics.meta": "skeptics.meta.stackexchange.com", + "softwareengineering": "softwareengineering.stackexchange.com", + "softwareengineering.meta": "softwareengineering.meta.stackexchange.com", + "softwarerecs": "softwarerecs.stackexchange.com", + "softwarerecs.meta": "softwarerecs.meta.stackexchange.com", + "sound": "sound.stackexchange.com", + "sound.meta": "sound.meta.stackexchange.com", + "space": "space.stackexchange.com", + "space.meta": "space.meta.stackexchange.com", + "spanish": "spanish.stackexchange.com", + "spanish.meta": "spanish.meta.stackexchange.com", + "sports": "sports.stackexchange.com", + "sports.meta": "sports.meta.stackexchange.com", + "sqa": "sqa.stackexchange.com", + "sqa.meta": "sqa.meta.stackexchange.com", + "stackapps": "stackapps.com", + # "stackexchange": "stackexchange.com", + "stats.meta": "stats.meta.stackexchange.com", + "stats": "stats.stackexchange.com", + "stellar.meta": "stellar.meta.stackexchange.com", + "stellar": "stellar.stackexchange.com", + "superuser": "superuser.com", + "sustainability": 
"sustainability.stackexchange.com", + "sustainability.meta": "sustainability.meta.stackexchange.com", + "tex": "tex.stackexchange.com", + "tex.meta": "tex.meta.stackexchange.com", + "tezos": "tezos.stackexchange.com", + "tezos.meta": "tezos.meta.stackexchange.com", + "tor": "tor.stackexchange.com", + "tor.meta": "tor.meta.stackexchange.com", + "travel": "travel.stackexchange.com", + "travel.meta": "travel.meta.stackexchange.com", + "tridion": "tridion.stackexchange.com", + "tridion.meta": "tridion.meta.stackexchange.com", + "ukrainian": "ukrainian.stackexchange.com", + "ukrainian.meta": "ukrainian.meta.stackexchange.com", + "unix": "unix.stackexchange.com", + "unix.meta": "unix.meta.stackexchange.com", + "ux": "ux.stackexchange.com", + "ux.meta": "ux.meta.stackexchange.com", + "vegetarianism": "vegetarianism.stackexchange.com", + "vegetarianism.meta": "vegetarianism.meta.stackexchange.com", + "vi": "vi.stackexchange.com", + "vi.meta": "vi.meta.stackexchange.com", + "webapps": "webapps.stackexchange.com", + "webapps.meta": "webapps.meta.stackexchange.com", + "webmasters": "webmasters.stackexchange.com", + "webmasters.meta": "webmasters.meta.stackexchange.com", + "windowsphone": "windowsphone.stackexchange.com", + "windowsphone.meta": "windowsphone.meta.stackexchange.com", + "woodworking": "woodworking.stackexchange.com", + "woodworking.meta": "woodworking.meta.stackexchange.com", + "wordpress": "wordpress.stackexchange.com", + "wordpress.meta": "wordpress.meta.stackexchange.com", + "workplace": "workplace.stackexchange.com", + "workplace.meta": "workplace.meta.stackexchange.com", + "worldbuilding": "worldbuilding.stackexchange.com", + "worldbuilding.meta": "worldbuilding.meta.stackexchange.com", + "writers": "writers.stackexchange.com", + "writers.meta": "writers.meta.stackexchange.com", + "stackoverflow": "stackoverflow.com", +} From 58dde1740f619530b0e2d36d66c26f5740ce4119 Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Wed, 20 Sep 2023 15:36:58 +0200 Subject: 
[PATCH 2/5] add code --- data_analysis/stackoverflow/README.md | 3 +++ data_analysis/stackoverflow/h4_code/README.md | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 data_analysis/stackoverflow/README.md diff --git a/data_analysis/stackoverflow/README.md b/data_analysis/stackoverflow/README.md new file mode 100644 index 0000000..089cc28 --- /dev/null +++ b/data_analysis/stackoverflow/README.md @@ -0,0 +1,3 @@ +## Code for processing StackExchange data + +Code for processing stackexchange data dump available in `h4_code` (to build https://huggingface.co/datasets/HuggingFaceH4/stack-exchange-preferences) and `other`, notebook for further processing (e.g convert all HTML to Markdown) in `StackExchangeProcessing.ipynb` (to build https://huggingface.co/datasets/lvwerra/stack-exchange-paired) diff --git a/data_analysis/stackoverflow/h4_code/README.md b/data_analysis/stackoverflow/h4_code/README.md index e918771..d3e3ed4 100644 --- a/data_analysis/stackoverflow/h4_code/README.md +++ b/data_analysis/stackoverflow/h4_code/README.md @@ -20,7 +20,7 @@ It is a long for-loop over desired exchanges. 
python scripts/data/pmp/stack_exchange_process.py --save_path=/path/to/hf-dataset ``` -3) `binarize.py`: used to binarize the pre-filter Stack Exchange data (and in the future, Reddit / Wikipedia) +3) `binarize.py`: used to binarize the pre-filter Stack Exchange data ```shell python scripts/data/pmp/binarize.py --save_path=/path/to/hf-dataset ``` \ No newline at end of file From 3ef2a868d7ccd6d9bb699c0e9a63a142f405eb2e Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Wed, 20 Sep 2023 15:38:27 +0200 Subject: [PATCH 3/5] add notebook --- .../StackExchangeProcessing.ipynb | 718 ++++++++++++++++++ 1 file changed, 718 insertions(+) create mode 100644 data_analysis/stackoverflow/StackExchangeProcessing.ipynb diff --git a/data_analysis/stackoverflow/StackExchangeProcessing.ipynb b/data_analysis/stackoverflow/StackExchangeProcessing.ipynb new file mode 100644 index 0000000..d37df68 --- /dev/null +++ b/data_analysis/stackoverflow/StackExchangeProcessing.ipynb @@ -0,0 +1,718 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 86, + "id": "7821c501-8c5d-4af6-81cd-caa6ad0bd58c", + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset, DatasetDict\n", + "from datasets import concatenate_datasets\n", + "from IPython.display import HTML\n", + "\n", + "from tqdm import tqdm\n", + "import re \n", + "import numpy as np\n", + "from markdownify import markdownify as md" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "dc821970-efdb-407f-bd79-59da09323280", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Found cached dataset parquet (/home/leandro/.cache/huggingface/datasets/HuggingFaceH4___parquet/HuggingFaceH4--stack-exchange-preferences-1d2bff9ecb5ffe2a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)\n" + ] + }, + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['qid', 'question', 'answers', 
'date', 'metadata'],\n", + " num_rows: 10807695\n", + "})" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds = load_dataset(\"HuggingFaceH4/stack-exchange-preferences\", split=\"train\", num_proc=16)\n", + "ds" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "0d8d8729-6d6b-4791-a24a-cb112c399bd0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

I have been wanting to learn about 3D printing a long time so I really want this site to succeed but I have no previous experience with the subject.

\n", + "\n", + "

I was wondering how can I help the site at this early stage. I thought about asking about how to get started with 3D printing but SE explicitly discourages \"easy\" questions in the private beta.

\n", + "\n", + "

What can newbies like me do for the site at this stage besides voting questions and answers?

\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "HTML(ds[0][\"question\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "b3b60caa-3bd9-4033-ab1c-90c5b08ef3ec", + "metadata": {}, + "outputs": [], + "source": [ + "def lang_callback(el):\n", + " lang = el['class'][0] if el.has_attr('class') else None\n", + " \n", + " if not lang is None:\n", + " lang = lang.split(\"-\")[-1]\n", + " return lang" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "de1123a0-7468-4d13-a8d3-4011ace36c3c", + "metadata": {}, + "outputs": [], + "source": [ + "def html2md(text):\n", + " text = md(text, code_language_callback=lang_callback)\n", + " text = re.sub(r\"\\n\\s*\\n\", \"\\n\\n\", text).strip()\n", + " return text.encode('utf-8', 'replace').decode()" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "c9da64a0-c753-4d35-9369-b70a7a9fa2f9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "I have been wanting to learn about 3D printing a long time so I really want this site to succeed but I have no previous experience with the subject. \n", + "\n", + "I was wondering how can I help the site at this early stage. 
I thought about asking about how to get started with 3D printing but SE explicitly discourages \"easy\" questions in the private beta.\n", + "\n", + "What can newbies like me do for the site at this stage besides voting questions and answers?\n", + "====================\n" + ] + } + ], + "source": [ + "for i in range(1):\n", + " text = html2md(ds[i][\"question\"])\n", + " print(text)\n", + " print(\"==\"*10)" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "3bf33a2f-fed5-49e7-8046-e813ad172b17", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "49.935" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.mean([len(ds[i][\"answers\"])*(len(ds[i][\"answers\"])-1)/2 for i in range(10000)])" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "88ea2dd5-b885-4f65-bae3-1319c7816044", + "metadata": {}, + "outputs": [], + "source": [ + "ds = ds.shuffle(seed=42)\n", + "index = list(range(len(ds)))\n", + "\n", + "ds_splits = DatasetDict({\n", + " \"finetune\": ds.select(index[:3_000_000]),\n", + " \"reward\": ds.select(index[3_000_000:6_000_000]),\n", + " \"rl\": ds.select(index[6_000_000:9_000_000]),\n", + " \"evaluation\": ds.select(index[9_000_000:]),\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "id": "1607922d-f585-4de7-be70-2205b5170102", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatasetDict({\n", + " finetune: Dataset({\n", + " features: ['qid', 'question', 'answers', 'date', 'metadata'],\n", + " num_rows: 3000000\n", + " })\n", + " reward: Dataset({\n", + " features: ['qid', 'question', 'answers', 'date', 'metadata'],\n", + " num_rows: 3000000\n", + " })\n", + " rl: Dataset({\n", + " features: ['qid', 'question', 'answers', 'date', 'metadata'],\n", + " num_rows: 3000000\n", + " })\n", + " evaluation: Dataset({\n", + " features: ['qid', 'question', 'answers', 'date', 'metadata'],\n", + " 
num_rows: 1807695\n", + " })\n", + "})" + ] + }, + "execution_count": 88, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds_splits" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "edc8af18-94a5-49e9-ae73-ce4ba81d9739", + "metadata": {}, + "outputs": [], + "source": [ + "def binary_comparison(answers):\n", + " \"\"\"Returns tuples of answers, first always best\"\"\"\n", + " pairs = []\n", + " \n", + " for i in range(len(answers)-1):\n", + " for j in range(i+1, len(answers)):\n", + " if answers[i][\"pm_score\"]>answers[j][\"pm_score\"]:\n", + " pairs.append((answers[i][\"text\"], answers[j][\"text\"]))\n", + " elif answers[i][\"pm_score\"] MAX_PAIRS_PER_QUESTION:\n", + " indices = np.random.choice(list(range(len(pairs))), MAX_PAIRS_PER_QUESTION, replace=False)\n", + " pairs = [pairs[i] for i in indices]\n", + " \n", + " # construct the samples\n", + " for pair in pairs:\n", + " for key in examples:\n", + " if key==\"question\":\n", + " new_examples[key].append(html2md(examples[key][sample_id]))\n", + " else:\n", + " new_examples[key].append(examples[key][sample_id])\n", + " new_examples[\"response_j\"].append(html2md(pair[0]))\n", + " new_examples[\"response_k\"].append(html2md(pair[1]))\n", + " return new_examples" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "id": "ac06aac5-3953-4321-9f1e-6ff210bee82d", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Map (num_proc=60): 0%| | 0/3000000 [00:00\\n\\nNow it says this\\n\\n```\\nCallback for successful upload requests.\\n$('#fileupload')\\n .bind('fileuploaddone', function (e, data) {/* ... */})\\n\\n```\\n\\nNow I have defined this custom function for testing in my own js file\\n\\n```\\n$('#fileupload').bind('fileuploaddone', function (e, data) {/* ... 
*/\\nalert('Hello');\\n})\\n\\n```\\n\\nBut it's not working.\\n\\nBut if I edit the main file in here\\n\\n```\\n // Callback for successful uploads:\\n done: function (e, data) {\\n\\n```\\n\\nThen it works.\",\n", + " 'answers': [{'answer_id': 12891484,\n", + " 'author': 'Reflective',\n", + " 'author_id': 1686626,\n", + " 'author_profile': 'https://Stackoverflow.com/users/1686626',\n", + " 'pm_score': 4,\n", + " 'selected': True,\n", + " 'text': \"

Looking at the library code, seems all events are renamed removing 'fileupload' ... so 'fileuploaddone' becomes just 'done'. It is valid for all other callbacks.\\nlook at this section:

\\n\\n
    // Other callbacks:\\n    // Callback for the submit event of each file upload:\\n    // submit: function (e, data) {}, // .bind('fileuploadsubmit', func);\\n    // Callback for the start of each file upload request:\\n    // send: function (e, data) {}, // .bind('fileuploadsend', func);\\n    // Callback for successful uploads:\\n    // done: function (e, data) {}, // .bind('fileuploaddone', func);\\n    // Callback for failed (abort or error) uploads:\\n    // fail: function (e, data) {}, // .bind('fileuploadfail', func);\\n    // Callback for completed (success, abort or error) requests:\\n    // always: function (e, data) {}, // .bind('fileuploadalways', func);\\n    // Callback for upload progress events:\\n    // progress: function (e, data) {}, // .bind('fileuploadprogress', func);\\n    // Callback for global upload progress events:\\n    // progressall: function (e, data) {}, // .bind('fileuploadprogressall', func);\\n    // Callback for uploads start, equivalent to the global ajaxStart event:\\n    // start: function (e) {}, // .bind('fileuploadstart', func);\\n    // Callback for uploads stop, equivalent to the global ajaxStop event:\\n    // stop: function (e) {}, // .bind('fileuploadstop', func);\\n    // Callback for change events of the fileInput(s):\\n    // change: function (e, data) {}, // .bind('fileuploadchange', func);\\n    // Callback for paste events to the pasteZone(s):\\n    // paste: function (e, data) {}, // .bind('fileuploadpaste', func);\\n    // Callback for drop events of the dropZone(s):\\n    // drop: function (e, data) {}, // .bind('fileuploaddrop', func);\\n    // Callback for dragover events of the dropZone(s):\\n    // dragover: function (e) {}, // .bind('fileuploaddragover', func);\\n
\\n\\n

If you have some doubts about what's happening, just look at the code inside. This library is not compressed so it is easy to see. for example

\\n\\n
// start: function (e) {}, // .bind('fileuploadstart', func);\\n
\\n\\n

start callback is implemented. fileuploadstart is not.

\\n\"},\n", + " {'answer_id': 15419140,\n", + " 'author': 'NXT',\n", + " 'author_id': 1554649,\n", + " 'author_profile': 'https://Stackoverflow.com/users/1554649',\n", + " 'pm_score': 3,\n", + " 'selected': False,\n", + " 'text': '

Check if the server-side uploading script returns a JSON reply - in my case it didn\\'t work when the reply was empty, but file was uploaded successfully.

\\n\\n

So, below is working for me with jQuery 1.9.1 and the newest version of the \"jQuery File Upload Plugin\" - 5.21.3

\\n\\n
$(\"#fileupload\").bind(\"fileuploaddone\", function (e, data) {\\n    console.log(\"fileuploaddone event fired\");\\n});\\n
\\n'}],\n", + " 'date': '2012/10/15',\n", + " 'metadata': ['https://Stackoverflow.com/questions/12891264',\n", + " 'https://Stackoverflow.com',\n", + " 'https://Stackoverflow.com/users/767244/'],\n", + " 'response_j': \"Looking at the library code, seems all events are renamed removing 'fileupload' ... so 'fileuploaddone' becomes just 'done'. It is valid for all other callbacks.\\nlook at this section:\\n\\n```\\n // Other callbacks:\\n // Callback for the submit event of each file upload:\\n // submit: function (e, data) {}, // .bind('fileuploadsubmit', func);\\n // Callback for the start of each file upload request:\\n // send: function (e, data) {}, // .bind('fileuploadsend', func);\\n // Callback for successful uploads:\\n // done: function (e, data) {}, // .bind('fileuploaddone', func);\\n // Callback for failed (abort or error) uploads:\\n // fail: function (e, data) {}, // .bind('fileuploadfail', func);\\n // Callback for completed (success, abort or error) requests:\\n // always: function (e, data) {}, // .bind('fileuploadalways', func);\\n // Callback for upload progress events:\\n // progress: function (e, data) {}, // .bind('fileuploadprogress', func);\\n // Callback for global upload progress events:\\n // progressall: function (e, data) {}, // .bind('fileuploadprogressall', func);\\n // Callback for uploads start, equivalent to the global ajaxStart event:\\n // start: function (e) {}, // .bind('fileuploadstart', func);\\n // Callback for uploads stop, equivalent to the global ajaxStop event:\\n // stop: function (e) {}, // .bind('fileuploadstop', func);\\n // Callback for change events of the fileInput(s):\\n // change: function (e, data) {}, // .bind('fileuploadchange', func);\\n // Callback for paste events to the pasteZone(s):\\n // paste: function (e, data) {}, // .bind('fileuploadpaste', func);\\n // Callback for drop events of the dropZone(s):\\n // drop: function (e, data) {}, // .bind('fileuploaddrop', func);\\n // Callback for dragover events 
of the dropZone(s):\\n // dragover: function (e) {}, // .bind('fileuploaddragover', func);\\n\\n```\\n\\nIf you have some doubts about what's happening, just look at the code inside. This library is not compressed so it is easy to see. for example\\n\\n```\\n// start: function (e) {}, // .bind('fileuploadstart', func);\\n\\n```\\n\\n`start` callback is implemented. `fileuploadstart` is not.\",\n", + " 'response_k': 'Check if the server-side uploading script returns a JSON reply - in my case it didn\\'t work when the reply was empty, but file was uploaded successfully.\\n\\nSo, below is working for me with jQuery 1.9.1 and the newest version of the \"jQuery File Upload Plugin\" - 5.21.3\\n\\n```\\n$(\"#fileupload\").bind(\"fileuploaddone\", function (e, data) {\\n console.log(\"fileuploaddone event fired\");\\n});\\n\\n```'}" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds_result[\"finetune\"][0]" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "id": "2c96653b-7a5a-4cae-a327-b6aa77aa5850", + "metadata": {}, + "outputs": [], + "source": [ + "ds_result = ds_result.remove_columns([\"answers\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "id": "15c2e5ee-7c7d-4e98-9e63-e5d37a9354aa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatasetDict({\n", + " finetune: Dataset({\n", + " features: ['qid', 'question', 'date', 'metadata', 'response_j', 'response_k'],\n", + " num_rows: 7440923\n", + " })\n", + " reward: Dataset({\n", + " features: ['qid', 'question', 'date', 'metadata', 'response_j', 'response_k'],\n", + " num_rows: 7441998\n", + " })\n", + " rl: Dataset({\n", + " features: ['qid', 'question', 'date', 'metadata', 'response_j', 'response_k'],\n", + " num_rows: 7435908\n", + " })\n", + " evaluation: Dataset({\n", + " features: ['qid', 'question', 'date', 'metadata', 'response_j', 'response_k'],\n", + " num_rows: 4483004\n", + " })\n", + "})" + 
] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds_result" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "id": "4d42b35c-5252-4b49-ba4b-20818bc9e086", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "finetune\n", + "reward\n", + "rl\n", + "evaluation\n" + ] + } + ], + "source": [ + "for key in ds_result:\n", + " print(key)" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "id": "e32c11d7-a88e-4d92-9dfc-92b2a67c5455", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import time\n", + "from multiprocessing import Pool\n", + "from tqdm import tqdm\n", + "\n", + "from huggingface_hub import Repository\n", + "\n", + "\n", + "def save_shard(shard_tuple):\n", + " \"\"\"Save shard\"\"\"\n", + " filename, shard = shard_tuple\n", + " # use to_json instead to save as json file\n", + " shard.to_parquet(filename)\n", + "\n", + "\n", + "def save_manual_shards(ds, user=\"lvwerra\", remote_dataset_repo=\"stack-exchange-paired\", subfolder=\"train\"):\n", + " \"\"\"Save sharded data\n", + " Args:\n", + " ds (Dataset): dataset to be saved\n", + " user (str): user name\n", + " remote_dataset_repo (str): remote dataset repository\n", + " out_path (str): path to save the shards\"\"\"\n", + " # this will create a folder OUT_PATH that is a clone of REMOTE_DATASET_REPO\n", + " # you can save the shards inside it and do git add/commit/push to push data to the hub\n", + " out_path = remote_dataset_repo\n", + " # if out path doesnt already exist\n", + " if not os.path.exists(out_path):\n", + " repo = Repository(\n", + " local_dir=out_path,\n", + " clone_from=user + \"/\" + remote_dataset_repo,\n", + " repo_type=\"dataset\",\n", + " private=False,\n", + " use_auth_token=True,\n", + " git_user=user,\n", + " )\n", + "\n", + " # files will be numerous we save them in a folder called data inside out_path\n", + " if not 
os.path.exists(out_path):\n", + " os.mkdir(out_path + \"/data\")\n", + " os.mkdir(out_path + f\"/data/{subfolder}\")\n", + " \n", + " SHARD_SIZE = 1000 << 20\n", + " if ds._indices is not None:\n", + " dataset_nbytes = ds.data.nbytes * len(ds._indices) / len(ds.data)\n", + " else:\n", + " dataset_nbytes = ds.data.nbytes\n", + " num_shards = int(dataset_nbytes / SHARD_SIZE) + 1\n", + " print(f\"Number of shards: {num_shards}\")\n", + "\n", + " print(\"sharding the dataset\")\n", + " t_start = time.time()\n", + " shards = (\n", + " ds.shard(num_shards=num_shards, index=i, contiguous=True)\n", + " for i in range(num_shards)\n", + " )\n", + " # use f\"{OUT_PATH}/data/train-{index:05d}-of-{num_shards:05d}.json\" instead for json files\n", + " filenames = (\n", + " f\"{out_path}/data/{subfolder}/train-{index:05d}-of-{num_shards:05d}.parquet\"\n", + " for index in range(num_shards)\n", + " )\n", + "\n", + " with Pool(16) as p:\n", + " list(\n", + " tqdm(\n", + " p.imap_unordered(save_shard, zip(filenames, shards), chunksize=4),\n", + " total=num_shards,\n", + " )\n", + " )\n", + " print(f\"Time to save dataset: {time.time()-t_start:.2f}\")\n", + " # to push dataset to hub do: git add/commit/push inside OUT_PATH" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "id": "a90664eb-5c54-4fae-9a8a-d509bb2abdfe", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of shards: 20\n", + "sharding the dataset\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:28<00:00, 1.43s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time to save dataset: 29.15\n", + "Number of shards: 20\n", + "sharding the dataset\n" + ] + }, + { + "name": "stderr", + 
"output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:22<00:00, 1.15s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time to save dataset: 23.42\n", + "Number of shards: 20\n", + "sharding the dataset\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:10<00:00, 1.83it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time to save dataset: 11.36\n", + "Number of shards: 12\n", + "sharding the dataset\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:10<00:00, 1.12it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Time to save dataset: 11.13\n" + ] + } + ], + "source": [ + "for key in ds_result:\n", + " save_manual_shards(ds_result[key], subfolder=key)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d62f5a7f-2a23-4e0d-9e49-b29f88ea8c13", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + } + }, + "nbformat": 4, 
+ "nbformat_minor": 5 +} From e670afadf029d54192df72e6534ecb0aad4610aa Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Thu, 21 Sep 2023 13:54:59 +0200 Subject: [PATCH 4/5] add notebook --- .../pull-requests/reconstruct_prs.ipynb | 5043 +++++++++++++++++ 1 file changed, 5043 insertions(+) create mode 100644 data_analysis/pull-requests/reconstruct_prs.ipynb diff --git a/data_analysis/pull-requests/reconstruct_prs.ipynb b/data_analysis/pull-requests/reconstruct_prs.ipynb new file mode 100644 index 0000000..9bc724c --- /dev/null +++ b/data_analysis/pull-requests/reconstruct_prs.ipynb @@ -0,0 +1,5043 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: python-dateutil in /Users/loubnabenallal/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages (2.8.2)\n", + "Requirement already satisfied: six>=1.5 in /Users/loubnabenallal/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages (from python-dateutil) (1.16.0)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "pip install python-dateutil" + ] + }, + { + "cell_type": "code", + "execution_count": 329, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading readme: 100%|██████████| 8.02k/8.02k [00:00<00:00, 1.52MB/s]\n" + ] + } + ], + "source": [ + "import json\n", + "import pandas as pd\n", + "from 
dateutil.parser import parse\n", + "from datasets import load_dataset, Dataset\n", + "\n", + "small_ds = load_dataset(\"bigcode-data/the-stack-gh-pull-requests\", use_auth_token=True, split=\"train\", streaming=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 330, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import pandas as pd\n", + "from dateutil.parser import parse\n", + "from datasets import load_dataset, Dataset\n", + "\n", + "small_ds = load_dataset(\"bigcode-data/the-stack-gh-pull-requests\", use_auth_token=True, split=\"train\", streaming=True)\n", + "\n", + "size = 500_000\n", + "\n", + "ds = small_ds.shuffle(seed=0, buffer_size=1_000_000)\n", + "\n", + "# 10k subset of random samples from ds\n", + "fianl_ds = list(ds.take(size))\n", + "ds = Dataset.from_pandas(pd.DataFrame(data=fianl_ds))" + ] + }, + { + "cell_type": "code", + "execution_count": 332, + "metadata": {}, + "outputs": [], + "source": [ + "ds = Dataset.from_pandas(pd.DataFrame(data=fianl_ds))" + ] + }, + { + "cell_type": "code", + "execution_count": 365, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['pull_request.guid', 'pull_request.code_review_events', 'pull_request.events', 'pull_request.issue_events', 'bucket', '__index_level_0__'],\n", + " num_rows: 500000\n", + "})" + ] + }, + "execution_count": 365, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import pandas as pd\n", + "from dateutil.parser import parse\n", + "from datasets import load_dataset, Dataset\n", + "\n", + "small_ds = load_dataset(\"bigcode-data/the-stack-gh-pull-requests\", use_auth_token=True, split=\"train\", streaming=True)\n", + "\n", + "size = 500_000\n", + "\n", + "ds = small_ds.shuffle(seed=0, buffer_size=1_000_000)\n", + "\n", + "# 10k subset of random 
samples from ds\n", + "fianl_ds = list(ds.take(size))\n", + "ds = Dataset.from_pandas(pd.DataFrame(data=fianl_ds))" + ] + }, + { + "cell_type": "code", + "execution_count": 335, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'__index_level_0__': 46164,\n", + " 'bucket': None,\n", + " 'pull_request.code_review_events': None,\n", + " 'pull_request.events': '[{\"type\": \"PullRequestEvent\", \"action\": \"opened\", '\n", + " '\"actor.login\": \"pull[bot]\", \"actor.id\": 39814207, '\n", + " '\"user.login\": null, \"user.id\": null, \"user.type\": '\n", + " 'null, \"repo.name\": \"kofj/website\", \"repo.id\": '\n", + " '158894695, \"public\": true, \"created_at\": '\n", + " '\"2020-11-23T05:58:40Z\", \"org.id\": null, \"org.login\": '\n", + " 'null, \"pull_request.id\": 525472638, '\n", + " '\"pull_request.number\": 2460, \"pull_request.state\": '\n", + " '\"open\", \"pull_request.title\": \"[pull] master from '\n", + " 'kubernetes:master\", \"pull_request.body\": \"See Commits '\n", + " 'and Changes for more details.\\\\n\\\\n-----\\\\nCreated by '\n", + " '[ '\n", + " '**pull[bot]**](https://github.com/wei/pull)\\\\n\\\\n_Can '\n", + " 'you help keep this open source service alive? 
'\n", + " '**[\\\\ud83d\\\\udc96 Please sponsor : '\n", + " ')](https://prod.download/pull-pr-sponsor)**_\", '\n", + " '\"pull_request.user.login\": \"pull[bot]\", '\n", + " '\"pull_request.user.id\": 39814207, '\n", + " '\"pull_request.author_association\": \"NONE\", '\n", + " '\"pull_request.created_at\": \"2020-11-23T05:58:39Z\", '\n", + " '\"pull_request.updated_at\": \"2020-11-23T05:58:39Z\", '\n", + " '\"pull_request.closed_at\": null, '\n", + " '\"pull_request.merged_at\": null, '\n", + " '\"pull_request.merge_commit_sha\": null, '\n", + " '\"pull_request.locked\": false, '\n", + " '\"pull_request.assignee.login\": null, '\n", + " '\"pull_request.assignee.id\": null, '\n", + " '\"pull_request.assignee.type\": null, '\n", + " '\"pull_request.assignee.site_admin\": null, '\n", + " '\"pull_request.milestone.id\": null, '\n", + " '\"pull_request.milestone.number\": null, '\n", + " '\"pull_request.milestone.title\": null, '\n", + " '\"pull_request.milestone.description\": null, '\n", + " '\"pull_request.milestone.creator.login\": null, '\n", + " '\"pull_request.milestone.creator.id\": null, '\n", + " '\"pull_request.milestone.creator.type\": null, '\n", + " '\"pull_request.milestone.creator.site_admin\": null, '\n", + " '\"pull_request.milestone.open_issues\": null, '\n", + " '\"pull_request.milestone.closed_issues\": null, '\n", + " '\"pull_request.milestone.state\": null, '\n", + " '\"pull_request.milestone.created_at\": null, '\n", + " '\"pull_request.milestone.updated_at\": null, '\n", + " '\"pull_request.milestone.due_on\": null, '\n", + " '\"pull_request.milestone.closed_at\": null, '\n", + " '\"pull_request.merged\": false, '\n", + " '\"pull_request.mergeable\": null, '\n", + " '\"pull_request.mergeable_state\": \"unknown\", '\n", + " '\"pull_request.merged_by.login\": null, '\n", + " '\"pull_request.merged_by.id\": null, '\n", + " '\"pull_request.merged_by.type\": null, '\n", + " '\"pull_request.merged_by.site_admin\": null, '\n", + " 
'\"pull_request.comments\": 0, '\n", + " '\"pull_request.review_comments\": 0, '\n", + " '\"pull_request.commits\": 4, \"pull_request.additions\": '\n", + " '243, \"pull_request.deletions\": 0, '\n", + " '\"pull_request.changed_files\": 2, '\n", + " '\"pull_request.label.id\": null, '\n", + " '\"pull_request.label.name\": null, '\n", + " '\"pull_request.label.color\": null, '\n", + " '\"pull_request.label.default\": null, '\n", + " '\"pull_request.head.label\": \"kubernetes:master\", '\n", + " '\"pull_request.head.ref\": \"master\", '\n", + " '\"pull_request.head.sha\": '\n", + " '\"fd65678baa464abe7715dbf6df44284506c858a2\", '\n", + " '\"pull_request.head.user.login\": \"kubernetes\", '\n", + " '\"pull_request.head.user.type\": \"Organization\", '\n", + " '\"pull_request.head.repo.name\": \"website\", '\n", + " '\"pull_request.head.repo.full_name\": '\n", + " '\"kubernetes/website\", '\n", + " '\"pull_request.head.repo.owner.login\": \"kubernetes\", '\n", + " '\"pull_request.head.repo.owner.type\": \"Organization\", '\n", + " '\"pull_request.head.repo.private\": false, '\n", + " '\"pull_request.head.repo.homepage\": '\n", + " '\"https://kubernetes.io\", '\n", + " '\"pull_request.head.repo.description\": \"Kubernetes '\n", + " 'website and documentation repo: \", '\n", + " '\"pull_request.head.repo.fork\": false, '\n", + " '\"pull_request.head.repo.created_at\": '\n", + " '\"2016-02-10T22:46:48Z\", '\n", + " '\"pull_request.head.repo.updated_at\": '\n", + " '\"2020-11-23T02:09:41Z\", '\n", + " '\"pull_request.head.repo.pushed_at\": '\n", + " '\"2020-11-23T05:12:37Z\", '\n", + " '\"pull_request.head.repo.size\": 319781, '\n", + " '\"pull_request.head.repo.stargazers_count\": 2267, '\n", + " '\"pull_request.head.repo.watchers_count\": 2267, '\n", + " '\"pull_request.head.repo.language\": \"HTML\", '\n", + " '\"pull_request.head.repo.has_issues\": true, '\n", + " '\"pull_request.head.repo.has_projects\": true, '\n", + " '\"pull_request.head.repo.has_downloads\": true, 
'\n", + " '\"pull_request.head.repo.has_wiki\": true, '\n", + " '\"pull_request.head.repo.has_pages\": false, '\n", + " '\"pull_request.head.repo.forks_count\": 8508, '\n", + " '\"pull_request.head.repo.archived\": false, '\n", + " '\"pull_request.head.repo.disabled\": false, '\n", + " '\"pull_request.head.repo.open_issues_count\": 641, '\n", + " '\"pull_request.head.repo.forks\": 8508, '\n", + " '\"pull_request.head.repo.open_issues\": 641, '\n", + " '\"pull_request.head.repo.watchers\": 2267, '\n", + " '\"pull_request.head.repo.default_branch\": \"master\", '\n", + " '\"pull_request.head.repo.license.key\": \"cc-by-4.0\", '\n", + " '\"pull_request.head.repo.license.spdx_id\": '\n", + " '\"CC-BY-4.0\", \"pull_request.head.repo.license.name\": '\n", + " '\"Creative Commons Attribution 4.0 International\", '\n", + " '\"pull_request.base.label\": \"kofj:master\", '\n", + " '\"pull_request.base.ref\": \"master\", '\n", + " '\"pull_request.base.sha\": '\n", + " '\"97a882c38db18684471447d06dd15c984302e0a7\", '\n", + " '\"pull_request.base.user.login\": \"kofj\", '\n", + " '\"pull_request.base.user.type\": \"User\", '\n", + " '\"pull_request.base.repo.name\": \"website\", '\n", + " '\"pull_request.base.repo.full_name\": \"kofj/website\", '\n", + " '\"pull_request.base.repo.owner.login\": \"kofj\", '\n", + " '\"pull_request.base.repo.owner.type\": \"User\", '\n", + " '\"pull_request.base.repo.private\": false, '\n", + " '\"pull_request.base.repo.homepage\": '\n", + " '\"https://kubernetes.io\", '\n", + " '\"pull_request.base.repo.description\": \"Kubernetes '\n", + " 'website and documentation repo: \", '\n", + " '\"pull_request.base.repo.fork\": true, '\n", + " '\"pull_request.base.repo.created_at\": '\n", + " '\"2018-11-24T02:12:25Z\", '\n", + " '\"pull_request.base.repo.updated_at\": '\n", + " '\"2020-11-23T01:58:46Z\", '\n", + " '\"pull_request.base.repo.pushed_at\": '\n", + " '\"2020-11-23T01:58:43Z\", '\n", + " '\"pull_request.base.repo.size\": 286251, '\n", + " 
'\"pull_request.base.repo.stargazers_count\": 0, '\n", + " '\"pull_request.base.repo.watchers_count\": 0, '\n", + " '\"pull_request.base.repo.language\": \"HTML\", '\n", + " '\"pull_request.base.repo.has_issues\": false, '\n", + " '\"pull_request.base.repo.has_projects\": true, '\n", + " '\"pull_request.base.repo.has_downloads\": true, '\n", + " '\"pull_request.base.repo.has_wiki\": true, '\n", + " '\"pull_request.base.repo.has_pages\": false, '\n", + " '\"pull_request.base.repo.forks_count\": 0, '\n", + " '\"pull_request.base.repo.archived\": false, '\n", + " '\"pull_request.base.repo.disabled\": false, '\n", + " '\"pull_request.base.repo.open_issues_count\": 1, '\n", + " '\"pull_request.base.repo.forks\": 0, '\n", + " '\"pull_request.base.repo.open_issues\": 1, '\n", + " '\"pull_request.base.repo.watchers\": 0, '\n", + " '\"pull_request.base.repo.default_branch\": \"master\", '\n", + " '\"pull_request.base.repo.license.key\": \"cc-by-4.0\", '\n", + " '\"pull_request.base.repo.license.spdx_id\": '\n", + " '\"CC-BY-4.0\", \"pull_request.base.repo.license.name\": '\n", + " '\"Creative Commons Attribution 4.0 International\", '\n", + " '\"pull_request.guid\": \"kofj/website/pull/2460\"}, '\n", + " '{\"type\": \"PullRequestEvent\", \"action\": \"closed\", '\n", + " '\"actor.login\": \"pull[bot]\", \"actor.id\": 39814207, '\n", + " '\"user.login\": null, \"user.id\": null, \"user.type\": '\n", + " 'null, \"repo.name\": \"kofj/website\", \"repo.id\": '\n", + " '158894695, \"public\": true, \"created_at\": '\n", + " '\"2020-11-23T05:58:50Z\", \"org.id\": null, \"org.login\": '\n", + " 'null, \"pull_request.id\": 525472638, '\n", + " '\"pull_request.number\": 2460, \"pull_request.state\": '\n", + " '\"closed\", \"pull_request.title\": \"[pull] master from '\n", + " 'kubernetes:master\", \"pull_request.body\": \"See '\n", + " '[Commits](/kofj/website/pull/2460/commits) and '\n", + " '[Changes](/kofj/website/pull/2460/files) for more '\n", + " 
'details.\\\\n\\\\n-----\\\\nCreated by [ '\n", + " '**pull[bot]**](https://github.com/wei/pull)\\\\n\\\\n_Can '\n", + " 'you help keep this open source service alive? '\n", + " '**[\\\\ud83d\\\\udc96 Please sponsor : '\n", + " ')](https://prod.download/pull-pr-sponsor)**_\", '\n", + " '\"pull_request.user.login\": \"pull[bot]\", '\n", + " '\"pull_request.user.id\": 39814207, '\n", + " '\"pull_request.author_association\": \"NONE\", '\n", + " '\"pull_request.created_at\": \"2020-11-23T05:58:39Z\", '\n", + " '\"pull_request.updated_at\": \"2020-11-23T05:58:50Z\", '\n", + " '\"pull_request.closed_at\": \"2020-11-23T05:58:50Z\", '\n", + " '\"pull_request.merged_at\": \"2020-11-23T05:58:49Z\", '\n", + " '\"pull_request.merge_commit_sha\": '\n", + " '\"fd65678baa464abe7715dbf6df44284506c858a2\", '\n", + " '\"pull_request.locked\": false, '\n", + " '\"pull_request.assignee.login\": null, '\n", + " '\"pull_request.assignee.id\": null, '\n", + " '\"pull_request.assignee.type\": null, '\n", + " '\"pull_request.assignee.site_admin\": null, '\n", + " '\"pull_request.milestone.id\": null, '\n", + " '\"pull_request.milestone.number\": null, '\n", + " '\"pull_request.milestone.title\": null, '\n", + " '\"pull_request.milestone.description\": null, '\n", + " '\"pull_request.milestone.creator.login\": null, '\n", + " '\"pull_request.milestone.creator.id\": null, '\n", + " '\"pull_request.milestone.creator.type\": null, '\n", + " '\"pull_request.milestone.creator.site_admin\": null, '\n", + " '\"pull_request.milestone.open_issues\": null, '\n", + " '\"pull_request.milestone.closed_issues\": null, '\n", + " '\"pull_request.milestone.state\": null, '\n", + " '\"pull_request.milestone.created_at\": null, '\n", + " '\"pull_request.milestone.updated_at\": null, '\n", + " '\"pull_request.milestone.due_on\": null, '\n", + " '\"pull_request.milestone.closed_at\": null, '\n", + " '\"pull_request.merged\": true, '\n", + " '\"pull_request.mergeable\": null, '\n", + " 
'\"pull_request.mergeable_state\": \"unknown\", '\n", + " '\"pull_request.merged_by.login\": \"pull[bot]\", '\n", + " '\"pull_request.merged_by.id\": 39814207, '\n", + " '\"pull_request.merged_by.type\": \"Bot\", '\n", + " '\"pull_request.merged_by.site_admin\": false, '\n", + " '\"pull_request.comments\": 0, '\n", + " '\"pull_request.review_comments\": 0, '\n", + " '\"pull_request.commits\": 4, \"pull_request.additions\": '\n", + " '243, \"pull_request.deletions\": 0, '\n", + " '\"pull_request.changed_files\": 2, '\n", + " '\"pull_request.label.id\": null, '\n", + " '\"pull_request.label.name\": null, '\n", + " '\"pull_request.label.color\": null, '\n", + " '\"pull_request.label.default\": null, '\n", + " '\"pull_request.head.label\": \"kubernetes:master\", '\n", + " '\"pull_request.head.ref\": \"master\", '\n", + " '\"pull_request.head.sha\": '\n", + " '\"fd65678baa464abe7715dbf6df44284506c858a2\", '\n", + " '\"pull_request.head.user.login\": \"kubernetes\", '\n", + " '\"pull_request.head.user.type\": \"Organization\", '\n", + " '\"pull_request.head.repo.name\": \"website\", '\n", + " '\"pull_request.head.repo.full_name\": '\n", + " '\"kubernetes/website\", '\n", + " '\"pull_request.head.repo.owner.login\": \"kubernetes\", '\n", + " '\"pull_request.head.repo.owner.type\": \"Organization\", '\n", + " '\"pull_request.head.repo.private\": false, '\n", + " '\"pull_request.head.repo.homepage\": '\n", + " '\"https://kubernetes.io\", '\n", + " '\"pull_request.head.repo.description\": \"Kubernetes '\n", + " 'website and documentation repo: \", '\n", + " '\"pull_request.head.repo.fork\": false, '\n", + " '\"pull_request.head.repo.created_at\": '\n", + " '\"2016-02-10T22:46:48Z\", '\n", + " '\"pull_request.head.repo.updated_at\": '\n", + " '\"2020-11-23T02:09:41Z\", '\n", + " '\"pull_request.head.repo.pushed_at\": '\n", + " '\"2020-11-23T05:12:37Z\", '\n", + " '\"pull_request.head.repo.size\": 319781, '\n", + " '\"pull_request.head.repo.stargazers_count\": 2267, '\n", + " 
'\"pull_request.head.repo.watchers_count\": 2267, '\n", + " '\"pull_request.head.repo.language\": \"HTML\", '\n", + " '\"pull_request.head.repo.has_issues\": true, '\n", + " '\"pull_request.head.repo.has_projects\": true, '\n", + " '\"pull_request.head.repo.has_downloads\": true, '\n", + " '\"pull_request.head.repo.has_wiki\": true, '\n", + " '\"pull_request.head.repo.has_pages\": false, '\n", + " '\"pull_request.head.repo.forks_count\": 8508, '\n", + " '\"pull_request.head.repo.archived\": false, '\n", + " '\"pull_request.head.repo.disabled\": false, '\n", + " '\"pull_request.head.repo.open_issues_count\": 641, '\n", + " '\"pull_request.head.repo.forks\": 8508, '\n", + " '\"pull_request.head.repo.open_issues\": 641, '\n", + " '\"pull_request.head.repo.watchers\": 2267, '\n", + " '\"pull_request.head.repo.default_branch\": \"master\", '\n", + " '\"pull_request.head.repo.license.key\": \"cc-by-4.0\", '\n", + " '\"pull_request.head.repo.license.spdx_id\": '\n", + " '\"CC-BY-4.0\", \"pull_request.head.repo.license.name\": '\n", + " '\"Creative Commons Attribution 4.0 International\", '\n", + " '\"pull_request.base.label\": \"kofj:master\", '\n", + " '\"pull_request.base.ref\": \"master\", '\n", + " '\"pull_request.base.sha\": '\n", + " '\"97a882c38db18684471447d06dd15c984302e0a7\", '\n", + " '\"pull_request.base.user.login\": \"kofj\", '\n", + " '\"pull_request.base.user.type\": \"User\", '\n", + " '\"pull_request.base.repo.name\": \"website\", '\n", + " '\"pull_request.base.repo.full_name\": \"kofj/website\", '\n", + " '\"pull_request.base.repo.owner.login\": \"kofj\", '\n", + " '\"pull_request.base.repo.owner.type\": \"User\", '\n", + " '\"pull_request.base.repo.private\": false, '\n", + " '\"pull_request.base.repo.homepage\": '\n", + " '\"https://kubernetes.io\", '\n", + " '\"pull_request.base.repo.description\": \"Kubernetes '\n", + " 'website and documentation repo: \", '\n", + " '\"pull_request.base.repo.fork\": true, '\n", + " 
'\"pull_request.base.repo.created_at\": '\n", + " '\"2018-11-24T02:12:25Z\", '\n", + " '\"pull_request.base.repo.updated_at\": '\n", + " '\"2020-11-23T01:58:46Z\", '\n", + " '\"pull_request.base.repo.pushed_at\": '\n", + " '\"2020-11-23T05:58:46Z\", '\n", + " '\"pull_request.base.repo.size\": 286251, '\n", + " '\"pull_request.base.repo.stargazers_count\": 0, '\n", + " '\"pull_request.base.repo.watchers_count\": 0, '\n", + " '\"pull_request.base.repo.language\": \"HTML\", '\n", + " '\"pull_request.base.repo.has_issues\": false, '\n", + " '\"pull_request.base.repo.has_projects\": true, '\n", + " '\"pull_request.base.repo.has_downloads\": true, '\n", + " '\"pull_request.base.repo.has_wiki\": true, '\n", + " '\"pull_request.base.repo.has_pages\": false, '\n", + " '\"pull_request.base.repo.forks_count\": 0, '\n", + " '\"pull_request.base.repo.archived\": false, '\n", + " '\"pull_request.base.repo.disabled\": false, '\n", + " '\"pull_request.base.repo.open_issues_count\": 0, '\n", + " '\"pull_request.base.repo.forks\": 0, '\n", + " '\"pull_request.base.repo.open_issues\": 0, '\n", + " '\"pull_request.base.repo.watchers\": 0, '\n", + " '\"pull_request.base.repo.default_branch\": \"master\", '\n", + " '\"pull_request.base.repo.license.key\": \"cc-by-4.0\", '\n", + " '\"pull_request.base.repo.license.spdx_id\": '\n", + " '\"CC-BY-4.0\", \"pull_request.base.repo.license.name\": '\n", + " '\"Creative Commons Attribution 4.0 International\", '\n", + " '\"pull_request.guid\": \"kofj/website/pull/2460\"}]',\n", + " 'pull_request.guid': 'kofj/website/pull/2460',\n", + " 'pull_request.issue_events': None}\n" + ] + } + ], + "source": [ + "from pprint import pprint\n", + "\n", + "pprint(ds[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 412, + "metadata": {}, + "outputs": [], + "source": [ + "# merge all three instances\n", + "\n", + "pull_request_info_cols = [\n", + " \"repo.name\",\n", + " \"repo.id\",\n", + " \"org.id\",\n", + " \"public\",\n", + " 
\"pull_request.id\",\n", + " \"pull_request.guid\",\n", + " \"pull_request.number\",\n", + " \"pull_request.title\",\n", + " \"pull_request.body\",\n", + " \"pull_request.state\",\n", + " \"pull_request.user.login\",\n", + " \"pull_request.user.id\",\n", + " # add user type\n", + " \"pull_request.head.user.type\",\n", + " \"pull_request.base.user.type\",\n", + " \"pull_request.created_at\",\n", + " \"pull_request.closed_at\",\n", + " \"pull_request.merged_at\",\n", + " \"pull_request.merged_by.login\",\n", + " \"pull_request.milestone.title\",\n", + " \"pull_request.milestone.description\",\n", + " \"pull_request.milestone.number\",\n", + " # commits\n", + " 'pull_request.commits',\n", + " 'pull_request.additions',\n", + " 'pull_request.deletions',\n", + " # changed files\n", + " 'pull_request.changed_files',\n", + " \"pull_request.comments\",\n", + " \"pull_request.review_comments\",\n", + "]\n", + "\n", + "head_info_cols = [\n", + " \"pull_request.head.label\",\n", + " \"pull_request.head.ref\",\n", + " \"pull_request.head.user.login\",\n", + " \"pull_request.head.user.type\",\n", + " \"pull_request.head.repo.owner.login\",\n", + " \"pull_request.head.repo.owner.type\",\n", + " \"pull_request.head.repo.license.name\",\n", + " \"pull_request.head.sha\",\n", + " 'pull_request.head.repo.name',\n", + " 'pull_request.head.repo.owner.login',\n", + " 'pull_request.head.repo.homepage',\n", + " 'pull_request.head.repo.description',\n", + " 'pull_request.head.repo.language',\n", + " 'pull_request.head.repo.stargazers_count',\n", + " 'pull_request.head.repo.license.name',\n", + " 'pull_request.head.repo.default_branch',\n", + " 'pull_request.head.repo.private'\n", + "]\n", + "base_info_cols = [\n", + " \"pull_request.base.label\",\n", + " \"pull_request.base.ref\",\n", + " \"pull_request.base.sha\",\n", + " \"pull_request.base.user.login\",\n", + " \"pull_request.base.user.type\",\n", + " \"pull_request.base.repo.owner.login\",\n", + " 
\"pull_request.base.repo.owner.type\",\n", + " \"pull_request.base.repo.license.name\",\n", + " \"pull_request.base.repo.default_branch\",\n", + " \"pull_request.base.repo.description\",\n", + " \"pull_request.base.repo.language\",\n", + " \"pull_request.base.repo.watchers_count\",\n", + " \"pull_request.base.repo.open_issues_count\",\n", + " \"pull_request.base.repo.forks_count\",\n", + " 'pull_request.base.repo.name',\n", + " 'pull_request.base.repo.owner.login',\n", + " 'pull_request.base.repo.homepage',\n", + " 'pull_request.base.repo.description',\n", + " 'pull_request.base.repo.language',\n", + " 'pull_request.base.repo.stargazers_count',\n", + " 'pull_request.base.repo.private',\n", + " 'pull_request.comments',\n", + " 'pull_request.review_comments',\n", + " 'pull_request.label.name',\n", + "]\n", + "\n", + "reviews_info = [# review events only\n", + " 'actor.login',\n", + " 'actor.id',\n", + " 'user.login',\n", + " 'user.type',\n", + " 'review.state',\n", + " 'review.id', \n", + " 'review.body', \n", + " 'review.commit_id', \n", + " 'review.submitted_at', \n", + " 'review.author_association',\n", + " \"pull_request.state\",\n", + " \"pull_request.merged\",\n", + " \"pull_request.merged_by.login\",\n", + " \"pull_request.merged_by.type\",\n", + " # comments\n", + " 'comment.id',\n", + " 'comment.diff_hunk',\n", + " 'comment.body',\n", + " 'comment.path',\n", + " 'comment.position',\n", + " 'comment.original_position',\n", + " 'comment.commit_id',\n", + " 'comment.original_commit_id',\n", + " 'comment.created_at',\n", + " 'comment.updated_at',\n", + " 'comment.author_association',\n", + " 'comment.start_line',\n", + " 'comment.original_start_line',\n", + " 'comment.start_side',\n", + " 'comment.line',\n", + " 'comment.original_line',\n", + " 'comment.side',\n", + " 'comment.in_reply_to_id',]\n", + "\n", + "\n", + "issues_info = [\n", + " 'author',\n", + " 'comment',\n", + " 'comment_id']\n", + " \n", + "event_info = reviews_info + issues_info\n", + "\n", + 
"def get_event_info(review):\n", + " res = {k: review[k] if k in review else None for k in event_info}\n", + " # for keys in issues_info add prefix issue.\n", + " for k in issues_info:\n", + " res[\"issue.\" + k] = res[k]\n", + " del res[k]\n", + " return res\n", + "\n", + "def load_json(data):\n", + " try:\n", + " data = json.loads(data)\n", + " if isinstance(data, dict):\n", + " data = [data]\n", + " return data\n", + " except TypeError:\n", + " return []\n", + "\n", + "def update_datetime(e):\n", + " e[\"created_at\"] = parse(e[\"created_at\"])\n", + " return e\n", + "\n", + "def merge_events(row):\n", + " events = load_json(row[\"pull_request.events\"])\n", + " reviews = load_json(row[\"pull_request.code_review_events\"])\n", + " issues = load_json(row[\"pull_request.issue_events\"])\n", + "\n", + " assert len(issues) <= 1\n", + " if issues:\n", + " issues_events = issues[0][\"events\"]\n", + " # for each events in each category group all events sorted by \"created_at\" in one list\n", + " for e in issues_events:\n", + " e[\"created_at\"] = parse(e[\"datetime\"])\n", + " del e[\"datetime\"]\n", + " else:\n", + " issues_events = []\n", + " events = [update_datetime(e) for e in events]\n", + " reviews = [update_datetime(e) for e in reviews]\n", + " all_events = sorted(\n", + " events + reviews + issues_events,\n", + " key=lambda x: x[\"created_at\"]\n", + " )\n", + " try:\n", + " base_data = events[0] if events else reviews[0]\n", + " except IndexError:\n", + " if issues:\n", + " base_data = issues_events[0]\n", + " first_event = issues[0][\"events\"][0]\n", + " base_data['pull_request.title'] = first_event[\"title\"]\n", + " print(f'base data keys: {base_data.keys()}')\n", + " base_data[\"repo.name\"] = base_data[\"repo\"]\n", + " base_data[\"org.id\"] = base_data[\"org\"]\n", + " base_data[\"repo.name\"] = base_data[\"repo\"]\n", + " base_data[\"pull_request.number\"] = int(base_data[\"pull_request\"][\"number\"])\n", + " base_data[\"pull_request.user.login\"] 
= base_data[\"pull_request\"][\"user_login\"]\n", + " print(\"filling PR data from issue event\")\n", + " else:\n", + " raise IndexError(\"No events for PR\")\n", + " \n", + " # Initialize with default values\n", + " pr_info = {k: None for k in pull_request_info_cols}\n", + " head_info = {k: None for k in head_info_cols}\n", + " base_info = {k: None for k in base_info_cols}\n", + "\n", + " # Fill available data\n", + " pr_info.update({k: base_data[k] if k in base_data else None for k in pull_request_info_cols})\n", + " head_info.update({k: base_data[k] if k in base_data else None for k in head_info_cols })\n", + " base_info.update({k: base_data[k] if k in base_data else None for k in base_info_cols})\n", + "\n", + " # each comment should have \"comments\" and \"review_comments\" fields with \"extra_review_info\" field\n", + " comments = [{\"type\": e[\"type\"],\n", + " \"action\": e[\"action\"],\n", + " \"created_at\": e[\"created_at\"],\n", + " **get_event_info(e)} for e in all_events]\n", + " new_row = {\"pull_request_info\": pr_info, \"head_repo_info\": head_info, \"base_repo_info\": base_info, \"events\": comments}\n", + " return new_row" + ] + }, + { + "cell_type": "code", + "execution_count": 413, + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'repo'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[412], line 167\u001b[0m, in \u001b[0;36mmerge_events\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 166\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 167\u001b[0m base_data \u001b[39m=\u001b[39m events[\u001b[39m0\u001b[39m] \u001b[39mif\u001b[39;00m events \u001b[39melse\u001b[39;00m reviews[\u001b[39m0\u001b[39;49m]\n\u001b[1;32m 168\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mIndexError\u001b[39;00m:\n", + "\u001b[0;31mIndexError\u001b[0m: 
list index out of range", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[413], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m new_row \u001b[39m=\u001b[39m merge_events(row)\n", + "Cell \u001b[0;32mIn[412], line 173\u001b[0m, in \u001b[0;36mmerge_events\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 171\u001b[0m first_event \u001b[39m=\u001b[39m issues[\u001b[39m0\u001b[39m][\u001b[39m\"\u001b[39m\u001b[39mevents\u001b[39m\u001b[39m\"\u001b[39m][\u001b[39m0\u001b[39m]\n\u001b[1;32m 172\u001b[0m base_data[\u001b[39m'\u001b[39m\u001b[39mpull_request.title\u001b[39m\u001b[39m'\u001b[39m] \u001b[39m=\u001b[39m first_event[\u001b[39m\"\u001b[39m\u001b[39mtitle\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[0;32m--> 173\u001b[0m base_data[\u001b[39m\"\u001b[39m\u001b[39mrepo.name\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m base_data[\u001b[39m\"\u001b[39;49m\u001b[39mrepo\u001b[39;49m\u001b[39m\"\u001b[39;49m]\n\u001b[1;32m 174\u001b[0m base_data[\u001b[39m\"\u001b[39m\u001b[39morg.id\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m base_data[\u001b[39m\"\u001b[39m\u001b[39morg\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[1;32m 175\u001b[0m base_data[\u001b[39m\"\u001b[39m\u001b[39mrepo.name\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m base_data[\u001b[39m\"\u001b[39m\u001b[39mrepo\u001b[39m\u001b[39m\"\u001b[39m]\n", + "\u001b[0;31mKeyError\u001b[0m: 'repo'" + ] + } + ], + "source": [ + "new_row = merge_events(row)" + ] + }, + { + "cell_type": "code", + "execution_count": 411, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "ename": "KeyError", + "evalue": "'repo'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call 
last)", + "Cell \u001b[0;32mIn[410], line 167\u001b[0m, in \u001b[0;36mmerge_events\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 166\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 167\u001b[0m base_data \u001b[39m=\u001b[39m events[\u001b[39m0\u001b[39m] \u001b[39mif\u001b[39;00m events \u001b[39melse\u001b[39;00m reviews[\u001b[39m0\u001b[39;49m]\n\u001b[1;32m 168\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mIndexError\u001b[39;00m:\n", + "\u001b[0;31mIndexError\u001b[0m: list index out of range", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[411], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m small_ds_2 \u001b[39m=\u001b[39m ds\u001b[39m.\u001b[39mselect(\u001b[39mrange\u001b[39m(\u001b[39m1000\u001b[39m))\n\u001b[0;32m----> 2\u001b[0m dd \u001b[39m=\u001b[39m small_ds_2\u001b[39m.\u001b[39;49mmap(merge_events)\n", + "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:580\u001b[0m, in \u001b[0;36mtransmit_tasks..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 578\u001b[0m \u001b[39mself\u001b[39m: \u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m \u001b[39m=\u001b[39m kwargs\u001b[39m.\u001b[39mpop(\u001b[39m\"\u001b[39m\u001b[39mself\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 579\u001b[0m \u001b[39m# apply actual function\u001b[39;00m\n\u001b[0;32m--> 580\u001b[0m out: Union[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mDatasetDict\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m func(\u001b[39mself\u001b[39;49m, \u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 581\u001b[0m datasets: List[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(out\u001b[39m.\u001b[39mvalues()) 
\u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(out, \u001b[39mdict\u001b[39m) \u001b[39melse\u001b[39;00m [out]\n\u001b[1;32m 582\u001b[0m \u001b[39mfor\u001b[39;00m dataset \u001b[39min\u001b[39;00m datasets:\n\u001b[1;32m 583\u001b[0m \u001b[39m# Remove task templates if a column mapping of the template is no longer valid\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:545\u001b[0m, in \u001b[0;36mtransmit_format..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 538\u001b[0m self_format \u001b[39m=\u001b[39m {\n\u001b[1;32m 539\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mtype\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_format_type,\n\u001b[1;32m 540\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mformat_kwargs\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_format_kwargs,\n\u001b[1;32m 541\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mcolumns\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_format_columns,\n\u001b[1;32m 542\u001b[0m \u001b[39m\"\u001b[39m\u001b[39moutput_all_columns\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_output_all_columns,\n\u001b[1;32m 543\u001b[0m }\n\u001b[1;32m 544\u001b[0m \u001b[39m# apply actual function\u001b[39;00m\n\u001b[0;32m--> 545\u001b[0m out: Union[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mDatasetDict\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m func(\u001b[39mself\u001b[39;49m, \u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 546\u001b[0m datasets: List[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(out\u001b[39m.\u001b[39mvalues()) \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(out, \u001b[39mdict\u001b[39m) 
\u001b[39melse\u001b[39;00m [out]\n\u001b[1;32m 547\u001b[0m \u001b[39m# re-apply format to the output\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:3087\u001b[0m, in \u001b[0;36mDataset.map\u001b[0;34m(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)\u001b[0m\n\u001b[1;32m 3079\u001b[0m \u001b[39mif\u001b[39;00m transformed_dataset \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 3080\u001b[0m \u001b[39mwith\u001b[39;00m logging\u001b[39m.\u001b[39mtqdm(\n\u001b[1;32m 3081\u001b[0m disable\u001b[39m=\u001b[39m\u001b[39mnot\u001b[39;00m logging\u001b[39m.\u001b[39mis_progress_bar_enabled(),\n\u001b[1;32m 3082\u001b[0m unit\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m examples\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 3085\u001b[0m desc\u001b[39m=\u001b[39mdesc \u001b[39mor\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mMap\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 3086\u001b[0m ) \u001b[39mas\u001b[39;00m pbar:\n\u001b[0;32m-> 3087\u001b[0m \u001b[39mfor\u001b[39;00m rank, done, content \u001b[39min\u001b[39;00m Dataset\u001b[39m.\u001b[39m_map_single(\u001b[39m*\u001b[39m\u001b[39m*\u001b[39mdataset_kwargs):\n\u001b[1;32m 3088\u001b[0m \u001b[39mif\u001b[39;00m done:\n\u001b[1;32m 3089\u001b[0m shards_done \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m1\u001b[39m\n", + "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:3441\u001b[0m, in \u001b[0;36mDataset._map_single\u001b[0;34m(shard, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, cache_file_name, writer_batch_size, features, 
disable_nullable, fn_kwargs, new_fingerprint, rank, offset)\u001b[0m\n\u001b[1;32m 3439\u001b[0m _time \u001b[39m=\u001b[39m time\u001b[39m.\u001b[39mtime()\n\u001b[1;32m 3440\u001b[0m \u001b[39mfor\u001b[39;00m i, example \u001b[39min\u001b[39;00m shard_iterable:\n\u001b[0;32m-> 3441\u001b[0m example \u001b[39m=\u001b[39m apply_function_on_filtered_inputs(example, i, offset\u001b[39m=\u001b[39;49moffset)\n\u001b[1;32m 3442\u001b[0m \u001b[39mif\u001b[39;00m update_data:\n\u001b[1;32m 3443\u001b[0m \u001b[39mif\u001b[39;00m i \u001b[39m==\u001b[39m \u001b[39m0\u001b[39m:\n", + "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:3344\u001b[0m, in \u001b[0;36mDataset._map_single..apply_function_on_filtered_inputs\u001b[0;34m(pa_inputs, indices, check_same_num_examples, offset)\u001b[0m\n\u001b[1;32m 3342\u001b[0m \u001b[39mif\u001b[39;00m with_rank:\n\u001b[1;32m 3343\u001b[0m additional_args \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m (rank,)\n\u001b[0;32m-> 3344\u001b[0m processed_inputs \u001b[39m=\u001b[39m function(\u001b[39m*\u001b[39;49mfn_args, \u001b[39m*\u001b[39;49madditional_args, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mfn_kwargs)\n\u001b[1;32m 3345\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(processed_inputs, LazyDict):\n\u001b[1;32m 3346\u001b[0m processed_inputs \u001b[39m=\u001b[39m {\n\u001b[1;32m 3347\u001b[0m k: v \u001b[39mfor\u001b[39;00m k, v \u001b[39min\u001b[39;00m processed_inputs\u001b[39m.\u001b[39mdata\u001b[39m.\u001b[39mitems() \u001b[39mif\u001b[39;00m k \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m processed_inputs\u001b[39m.\u001b[39mkeys_to_format\n\u001b[1;32m 3348\u001b[0m }\n", + "Cell \u001b[0;32mIn[410], line 173\u001b[0m, in \u001b[0;36mmerge_events\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 171\u001b[0m first_event \u001b[39m=\u001b[39m 
issues[\u001b[39m0\u001b[39m][\u001b[39m\"\u001b[39m\u001b[39mevents\u001b[39m\u001b[39m\"\u001b[39m][\u001b[39m0\u001b[39m]\n\u001b[1;32m 172\u001b[0m base_data[\u001b[39m'\u001b[39m\u001b[39mpull_request.title\u001b[39m\u001b[39m'\u001b[39m] \u001b[39m=\u001b[39m first_event[\u001b[39m\"\u001b[39m\u001b[39mtitle\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[0;32m--> 173\u001b[0m base_data[\u001b[39m\"\u001b[39m\u001b[39mrepo.name\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m base_data[\u001b[39m\"\u001b[39;49m\u001b[39mrepo\u001b[39;49m\u001b[39m\"\u001b[39;49m]\n\u001b[1;32m 174\u001b[0m base_data[\u001b[39m\"\u001b[39m\u001b[39morg.id\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m base_data[\u001b[39m\"\u001b[39m\u001b[39morg\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[1;32m 175\u001b[0m base_data[\u001b[39m\"\u001b[39m\u001b[39mrepo.name\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m base_data[\u001b[39m\"\u001b[39m\u001b[39mrepo\u001b[39m\u001b[39m\"\u001b[39m]\n", + "\u001b[0;31mKeyError\u001b[0m: 'repo'" + ] + } + ], + "source": [ + "small_ds_2 = ds.select(range(1000))\n", + "dd = small_ds_2.map(merge_events)" + ] + }, + { + "cell_type": "code", + "execution_count": 405, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['action', 'author', 'comment', 'comment_id', 'description', 'title', 'type', 'created_at'])" + ] + }, + "execution_count": 405, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "issues_events[0].keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 366, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'pull_request.guid': 'karen-kua/COVID-19_Tracker/pull/15',\n", + " 'pull_request.code_review_events': None,\n", + " 'pull_request.events': None,\n", + " 'pull_request.issue_events': '{\"repo\": \"karen-kua/COVID-19_Tracker\", \"org\": null, \"issue_id\": 1018615993, \"issue_number\": 15, \"pull_request\": {\"number\": 15.0, \"repo\": 
\"COVID-19_Tracker\", \"user_login\": \"karen-kua\"}, \"events\": [{\"action\": \"opened\", \"author\": \"dependabot[bot]\", \"comment\": null, \"comment_id\": null, \"datetime\": \"2021-10-06T15:46:43Z\", \"description\": \"Bumps [url-parse](https://github.com/unshiftio/url-parse) from 1.4.7 to 1.5.3.\\\\n
\\\\nCommits\\\\n
    \\\\n
  • ad44493 [dist] 1.5.3
  • \\\\n
  • c798461 [fix] Fix host parsing for file URLs (#210)
  • \\\\n
  • 201034b [dist] 1.5.2
  • \\\\n
  • 2d9ac2c [fix] Sanitize only special URLs (#209)
  • \\\\n
  • fb128af [fix] Use \\'null\\' as origin for non special URLs
  • \\\\n
  • fed6d9e [fix] Add a leading slash only if the URL is special
  • \\\\n
  • 94872e7 [fix] Do not incorrectly set the slashes property to true
  • \\\\n
  • 81ab967 [fix] Ignore slashes after the protocol for special URLs
  • \\\\n
  • ee22050 [ci] Use GitHub Actions
  • \\\\n
  • d2979b5 [fix] Special case the file: protocol (#204)
  • \\\\n
  • Additional commits viewable in compare view
  • \\\\n
\\\\n
\\\\n
\\\\n\\\\n\\\\n[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=url-parse&package-manager=npm_and_yarn&previous-version=1.4.7&new-version=1.5.3)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)\\\\n\\\\nDependabot will resolve any conflicts with this PR as long as you don\\'t alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.\\\\n\\\\n[//]: # (dependabot-automerge-start)\\\\n[//]: # (dependabot-automerge-end)\\\\n\\\\n---\\\\n\\\\n
\\\\nDependabot commands and options\\\\n
\\\\n\\\\nYou can trigger Dependabot actions by commenting on this PR:\\\\n- `@dependabot rebase` will rebase this PR\\\\n- `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it\\\\n- `@dependabot merge` will merge this PR after your CI passes on it\\\\n- `@dependabot squash and merge` will squash and merge this PR after your CI passes on it\\\\n- `@dependabot cancel merge` will cancel a previously requested merge and block automerging\\\\n- `@dependabot reopen` will reopen this PR if it is closed\\\\n- `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually\\\\n- `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)\\\\n- `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)\\\\n- `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)\\\\n- `@dependabot use these labels` will set the current labels as the default for future PRs for this repo and language\\\\n- `@dependabot use these reviewers` will set the current reviewers as the default for future PRs for this repo and language\\\\n- `@dependabot use these assignees` will set the current assignees as the default for future PRs for this repo and language\\\\n- `@dependabot use this milestone` will set the current milestone as the default for future PRs for this repo and language\\\\n\\\\nYou can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/azukimochi/COVID-19_Tracker/network/alerts).\\\\n\\\\n
\", \"title\": \"Bump url-parse from 1.4.7 to 1.5.3\", \"type\": \"issue\"}, {\"action\": \"created\", \"author\": \"dependabot[bot]\", \"comment\": \"Superseded by #17.\", \"comment_id\": 1045459471.0, \"datetime\": \"2022-02-19 00:53:17+00:00\", \"description\": null, \"title\": null, \"type\": \"comment\"}]}',\n", + " 'bucket': '940',\n", + " '__index_level_0__': 72946}" + ] + }, + "execution_count": 366, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "row" + ] + }, + { + "cell_type": "code", + "execution_count": 360, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['repo', 'org', 'issue_id', 'issue_number', 'pull_request', 'events'])" + ] + }, + "execution_count": 360, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "issues[0].keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 361, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'number': 15.0, 'repo': 'COVID-19_Tracker', 'user_login': 'karen-kua'}" + ] + }, + "execution_count": 361, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "issues[0][\"pull_request\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 351, + "metadata": {}, + "outputs": [], + "source": [ + "small_ds_2 = ds.select(range(500))" + ] + }, + { + "cell_type": "code", + "execution_count": 398, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "ename": "KeyError", + "evalue": "'events'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[396], line 167\u001b[0m, in \u001b[0;36mmerge_events\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 166\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 167\u001b[0m base_data \u001b[39m=\u001b[39m 
events[\u001b[39m0\u001b[39m] \u001b[39mif\u001b[39;00m events \u001b[39melse\u001b[39;00m reviews[\u001b[39m0\u001b[39;49m]\n\u001b[1;32m 168\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mIndexError\u001b[39;00m:\n", + "\u001b[0;31mIndexError\u001b[0m: list index out of range", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[398], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m merged_ds \u001b[39m=\u001b[39m small_ds_2\u001b[39m.\u001b[39;49mmap(merge_events, remove_columns\u001b[39m=\u001b[39;49m[\u001b[39m\"\u001b[39;49m\u001b[39mpull_request.events\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m\"\u001b[39;49m\u001b[39mpull_request.code_review_events\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m\"\u001b[39;49m\u001b[39mpull_request.issue_events\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m'\u001b[39;49m\u001b[39m__index_level_0__\u001b[39;49m\u001b[39m'\u001b[39;49m,\u001b[39m'\u001b[39;49m\u001b[39mpull_request.guid\u001b[39;49m\u001b[39m'\u001b[39;49m])\n", + "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:580\u001b[0m, in \u001b[0;36mtransmit_tasks..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 578\u001b[0m \u001b[39mself\u001b[39m: \u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m \u001b[39m=\u001b[39m kwargs\u001b[39m.\u001b[39mpop(\u001b[39m\"\u001b[39m\u001b[39mself\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 579\u001b[0m \u001b[39m# apply actual function\u001b[39;00m\n\u001b[0;32m--> 580\u001b[0m out: Union[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mDatasetDict\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m func(\u001b[39mself\u001b[39;49m, \u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 581\u001b[0m datasets: 
List[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(out\u001b[39m.\u001b[39mvalues()) \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(out, \u001b[39mdict\u001b[39m) \u001b[39melse\u001b[39;00m [out]\n\u001b[1;32m 582\u001b[0m \u001b[39mfor\u001b[39;00m dataset \u001b[39min\u001b[39;00m datasets:\n\u001b[1;32m 583\u001b[0m \u001b[39m# Remove task templates if a column mapping of the template is no longer valid\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:545\u001b[0m, in \u001b[0;36mtransmit_format..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 538\u001b[0m self_format \u001b[39m=\u001b[39m {\n\u001b[1;32m 539\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mtype\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_format_type,\n\u001b[1;32m 540\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mformat_kwargs\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_format_kwargs,\n\u001b[1;32m 541\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mcolumns\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_format_columns,\n\u001b[1;32m 542\u001b[0m \u001b[39m\"\u001b[39m\u001b[39moutput_all_columns\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_output_all_columns,\n\u001b[1;32m 543\u001b[0m }\n\u001b[1;32m 544\u001b[0m \u001b[39m# apply actual function\u001b[39;00m\n\u001b[0;32m--> 545\u001b[0m out: Union[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mDatasetDict\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m func(\u001b[39mself\u001b[39;49m, \u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 546\u001b[0m datasets: List[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m] 
\u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(out\u001b[39m.\u001b[39mvalues()) \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(out, \u001b[39mdict\u001b[39m) \u001b[39melse\u001b[39;00m [out]\n\u001b[1;32m 547\u001b[0m \u001b[39m# re-apply format to the output\u001b[39;00m\n", + "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:3087\u001b[0m, in \u001b[0;36mDataset.map\u001b[0;34m(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)\u001b[0m\n\u001b[1;32m 3079\u001b[0m \u001b[39mif\u001b[39;00m transformed_dataset \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 3080\u001b[0m \u001b[39mwith\u001b[39;00m logging\u001b[39m.\u001b[39mtqdm(\n\u001b[1;32m 3081\u001b[0m disable\u001b[39m=\u001b[39m\u001b[39mnot\u001b[39;00m logging\u001b[39m.\u001b[39mis_progress_bar_enabled(),\n\u001b[1;32m 3082\u001b[0m unit\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m examples\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 3085\u001b[0m desc\u001b[39m=\u001b[39mdesc \u001b[39mor\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mMap\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 3086\u001b[0m ) \u001b[39mas\u001b[39;00m pbar:\n\u001b[0;32m-> 3087\u001b[0m \u001b[39mfor\u001b[39;00m rank, done, content \u001b[39min\u001b[39;00m Dataset\u001b[39m.\u001b[39m_map_single(\u001b[39m*\u001b[39m\u001b[39m*\u001b[39mdataset_kwargs):\n\u001b[1;32m 3088\u001b[0m \u001b[39mif\u001b[39;00m done:\n\u001b[1;32m 3089\u001b[0m shards_done \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m1\u001b[39m\n", + "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:3441\u001b[0m, in \u001b[0;36mDataset._map_single\u001b[0;34m(shard, 
function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, new_fingerprint, rank, offset)\u001b[0m\n\u001b[1;32m 3439\u001b[0m _time \u001b[39m=\u001b[39m time\u001b[39m.\u001b[39mtime()\n\u001b[1;32m 3440\u001b[0m \u001b[39mfor\u001b[39;00m i, example \u001b[39min\u001b[39;00m shard_iterable:\n\u001b[0;32m-> 3441\u001b[0m example \u001b[39m=\u001b[39m apply_function_on_filtered_inputs(example, i, offset\u001b[39m=\u001b[39;49moffset)\n\u001b[1;32m 3442\u001b[0m \u001b[39mif\u001b[39;00m update_data:\n\u001b[1;32m 3443\u001b[0m \u001b[39mif\u001b[39;00m i \u001b[39m==\u001b[39m \u001b[39m0\u001b[39m:\n", + "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:3344\u001b[0m, in \u001b[0;36mDataset._map_single..apply_function_on_filtered_inputs\u001b[0;34m(pa_inputs, indices, check_same_num_examples, offset)\u001b[0m\n\u001b[1;32m 3342\u001b[0m \u001b[39mif\u001b[39;00m with_rank:\n\u001b[1;32m 3343\u001b[0m additional_args \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m (rank,)\n\u001b[0;32m-> 3344\u001b[0m processed_inputs \u001b[39m=\u001b[39m function(\u001b[39m*\u001b[39;49mfn_args, \u001b[39m*\u001b[39;49madditional_args, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mfn_kwargs)\n\u001b[1;32m 3345\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(processed_inputs, LazyDict):\n\u001b[1;32m 3346\u001b[0m processed_inputs \u001b[39m=\u001b[39m {\n\u001b[1;32m 3347\u001b[0m k: v \u001b[39mfor\u001b[39;00m k, v \u001b[39min\u001b[39;00m processed_inputs\u001b[39m.\u001b[39mdata\u001b[39m.\u001b[39mitems() \u001b[39mif\u001b[39;00m k \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m processed_inputs\u001b[39m.\u001b[39mkeys_to_format\n\u001b[1;32m 3348\u001b[0m }\n", + "Cell \u001b[0;32mIn[396], line 170\u001b[0m, in 
\u001b[0;36mmerge_events\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 168\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mIndexError\u001b[39;00m:\n\u001b[1;32m 169\u001b[0m base_data \u001b[39m=\u001b[39m issues_events[\u001b[39m0\u001b[39m]\n\u001b[0;32m--> 170\u001b[0m first_event \u001b[39m=\u001b[39m base_data[\u001b[39m\"\u001b[39;49m\u001b[39mevents\u001b[39;49m\u001b[39m\"\u001b[39;49m][\u001b[39m0\u001b[39m]\n\u001b[1;32m 171\u001b[0m base_data[\u001b[39m'\u001b[39m\u001b[39mpull_request.title\u001b[39m\u001b[39m'\u001b[39m] \u001b[39m=\u001b[39m first_event[\u001b[39m\"\u001b[39m\u001b[39mtitle\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[1;32m 172\u001b[0m base_data[\u001b[39m\"\u001b[39m\u001b[39mrepo.name\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m base_data[\u001b[39m\"\u001b[39m\u001b[39mrepo\u001b[39m\u001b[39m\"\u001b[39m]\n", + "\u001b[0;31mKeyError\u001b[0m: 'events'" + ] + } + ], + "source": [ + "merged_ds = small_ds_2.map(merge_events, remove_columns=[\"pull_request.events\", \"pull_request.code_review_events\", \"pull_request.issue_events\", '__index_level_0__','pull_request.guid'])" + ] + }, + { + "cell_type": "code", + "execution_count": 150, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Creating parquet from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 31.42ba/s]\n", + "Upload 1 LFS files: 100%|██████████| 1/1 [00:10<00:00, 10.30s/it]\n", + "Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:11<00:00, 11.45s/it]\n" + ] + } + ], + "source": [ + "merged_ds.push_to_hub(\"loubnabnl/code_reviews_3\")" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading readme: 100%|██████████| 5.88k/5.88k [00:00<00:00, 3.76MB/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading and preparing dataset None/None to 
/Users/loubnabenallal/.cache/huggingface/datasets/loubnabnl___parquet/loubnabnl--clean_prs2-50c7cc07186d2bb2/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading data: 100%|██████████| 16.1M/16.1M [00:00<00:00, 17.4MB/s]\n", + "Downloading data files: 100%|██████████| 1/1 [00:02<00:00, 2.65s/it]\n", + "Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 676.50it/s]\n", + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset parquet downloaded and prepared to /Users/loubnabenallal/.cache/huggingface/datasets/loubnabnl___parquet/loubnabnl--clean_prs2-50c7cc07186d2bb2/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.\n" + ] + }, + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['bucket', 'pull_request_info', 'head_repo_info', 'base_repo_info', 'events'],\n", + " num_rows: 10000\n", + "})" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds = load_dataset(\"loubnabnl/clean_prs2\", split=\"train\")\n", + "ds" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'__index_level_0__': 1028,\n", + " 'bucket': None,\n", + " 'pull_request.code_review_events': None,\n", + " 'pull_request.events': '[{\"type\": \"PullRequestEvent\", \"action\": \"opened\", '\n", + " '\"actor.login\": \"M-Davies\", \"actor.id\": 25231953, '\n", + " '\"user.login\": null, \"user.id\": null, \"user.type\": '\n", + " 'null, \"repo.name\": \"AdoptOpenJDK/openjdk-build\", '\n", + " '\"repo.id\": 85294562, \"public\": true, \"created_at\": '\n", + " '\"2020-05-28T09:45:30Z\", \"org.id\": 1673867, '\n", + " '\"org.login\": \"AdoptOpenJDK\", \"pull_request.id\": '\n", + " '424372800, \"pull_request.number\": 
1787, '\n", + " '\"pull_request.state\": \"open\", \"pull_request.title\": '\n", + " '\"Revert \\'Fire installer failure on all failed '\n", + " 'results\\'\", \"pull_request.body\": \"* Seems to cause a '\n", + " 'lot of false positives or just doesnt work overall. '\n", + " 'Better to just remove for '\n", + " 'now\\\\r\\\\n\\\\r\\\\nSigned-off-by: Morgan Davies '\n", + " '\", \"pull_request.user.login\": '\n", + " '\"M-Davies\", \"pull_request.user.id\": 25231953, '\n", + " '\"pull_request.author_association\": \"CONTRIBUTOR\", '\n", + " '\"pull_request.created_at\": \"2020-05-28T09:45:30Z\", '\n", + " '\"pull_request.updated_at\": \"2020-05-28T09:45:30Z\", '\n", + " '\"pull_request.closed_at\": null, '\n", + " '\"pull_request.merged_at\": null, '\n", + " '\"pull_request.merge_commit_sha\": null, '\n", + " '\"pull_request.locked\": false, '\n", + " '\"pull_request.assignee.login\": null, '\n", + " '\"pull_request.assignee.id\": null, '\n", + " '\"pull_request.assignee.type\": null, '\n", + " '\"pull_request.assignee.site_admin\": null, '\n", + " '\"pull_request.milestone.id\": null, '\n", + " '\"pull_request.milestone.number\": null, '\n", + " '\"pull_request.milestone.title\": null, '\n", + " '\"pull_request.milestone.description\": null, '\n", + " '\"pull_request.milestone.creator.login\": null, '\n", + " '\"pull_request.milestone.creator.id\": null, '\n", + " '\"pull_request.milestone.creator.type\": null, '\n", + " '\"pull_request.milestone.creator.site_admin\": null, '\n", + " '\"pull_request.milestone.open_issues\": null, '\n", + " '\"pull_request.milestone.closed_issues\": null, '\n", + " '\"pull_request.milestone.state\": null, '\n", + " '\"pull_request.milestone.created_at\": null, '\n", + " '\"pull_request.milestone.updated_at\": null, '\n", + " '\"pull_request.milestone.due_on\": null, '\n", + " '\"pull_request.milestone.closed_at\": null, '\n", + " '\"pull_request.merged\": false, '\n", + " '\"pull_request.mergeable\": null, '\n", + " 
'\"pull_request.mergeable_state\": \"unknown\", '\n", + " '\"pull_request.merged_by.login\": null, '\n", + " '\"pull_request.merged_by.id\": null, '\n", + " '\"pull_request.merged_by.type\": null, '\n", + " '\"pull_request.merged_by.site_admin\": null, '\n", + " '\"pull_request.comments\": 0, '\n", + " '\"pull_request.review_comments\": 0, '\n", + " '\"pull_request.commits\": 1, \"pull_request.additions\": '\n", + " '4, \"pull_request.deletions\": 6, '\n", + " '\"pull_request.changed_files\": 1, '\n", + " '\"pull_request.label.id\": null, '\n", + " '\"pull_request.label.name\": null, '\n", + " '\"pull_request.label.color\": null, '\n", + " '\"pull_request.label.default\": null, '\n", + " '\"pull_request.head.label\": \"M-Davies:revert\", '\n", + " '\"pull_request.head.ref\": \"revert\", '\n", + " '\"pull_request.head.sha\": '\n", + " '\"023faba7db4130d746f68e6b4fb26170a3834254\", '\n", + " '\"pull_request.head.user.login\": \"M-Davies\", '\n", + " '\"pull_request.head.user.type\": \"User\", '\n", + " '\"pull_request.head.repo.name\": \"openjdk-build\", '\n", + " '\"pull_request.head.repo.full_name\": '\n", + " '\"M-Davies/openjdk-build\", '\n", + " '\"pull_request.head.repo.owner.login\": \"M-Davies\", '\n", + " '\"pull_request.head.repo.owner.type\": \"User\", '\n", + " '\"pull_request.head.repo.private\": false, '\n", + " '\"pull_request.head.repo.homepage\": \"\", '\n", + " '\"pull_request.head.repo.description\": \"AdoptOpenJDK '\n", + " 'community OpenJDK build scripts - common across all '\n", + " 'releases/versions\", \"pull_request.head.repo.fork\": '\n", + " 'true, \"pull_request.head.repo.created_at\": '\n", + " '\"2019-11-29T09:24:43Z\", '\n", + " '\"pull_request.head.repo.updated_at\": '\n", + " '\"2020-05-27T14:45:16Z\", '\n", + " '\"pull_request.head.repo.pushed_at\": '\n", + " '\"2020-05-27T14:45:13Z\", '\n", + " '\"pull_request.head.repo.size\": 2383, '\n", + " '\"pull_request.head.repo.stargazers_count\": 0, '\n", + " 
'\"pull_request.head.repo.watchers_count\": 0, '\n", + " '\"pull_request.head.repo.language\": \"Shell\", '\n", + " '\"pull_request.head.repo.has_issues\": false, '\n", + " '\"pull_request.head.repo.has_projects\": true, '\n", + " '\"pull_request.head.repo.has_downloads\": true, '\n", + " '\"pull_request.head.repo.has_wiki\": true, '\n", + " '\"pull_request.head.repo.has_pages\": false, '\n", + " '\"pull_request.head.repo.forks_count\": 0, '\n", + " '\"pull_request.head.repo.archived\": false, '\n", + " '\"pull_request.head.repo.disabled\": false, '\n", + " '\"pull_request.head.repo.open_issues_count\": 0, '\n", + " '\"pull_request.head.repo.forks\": 0, '\n", + " '\"pull_request.head.repo.open_issues\": 0, '\n", + " '\"pull_request.head.repo.watchers\": 0, '\n", + " '\"pull_request.head.repo.default_branch\": \"master\", '\n", + " '\"pull_request.head.repo.license.key\": \"apache-2.0\", '\n", + " '\"pull_request.head.repo.license.spdx_id\": '\n", + " '\"Apache-2.0\", \"pull_request.head.repo.license.name\": '\n", + " '\"Apache License 2.0\", \"pull_request.base.label\": '\n", + " '\"AdoptOpenJDK:master\", \"pull_request.base.ref\": '\n", + " '\"master\", \"pull_request.base.sha\": '\n", + " '\"32a19e7a01b4d50cc8c10f8f675a2aeb2ffeaefb\", '\n", + " '\"pull_request.base.user.login\": \"AdoptOpenJDK\", '\n", + " '\"pull_request.base.user.type\": \"Organization\", '\n", + " '\"pull_request.base.repo.name\": \"openjdk-build\", '\n", + " '\"pull_request.base.repo.full_name\": '\n", + " '\"AdoptOpenJDK/openjdk-build\", '\n", + " '\"pull_request.base.repo.owner.login\": \"AdoptOpenJDK\", '\n", + " '\"pull_request.base.repo.owner.type\": \"Organization\", '\n", + " '\"pull_request.base.repo.private\": false, '\n", + " '\"pull_request.base.repo.homepage\": \"\", '\n", + " '\"pull_request.base.repo.description\": \"AdoptOpenJDK '\n", + " 'community OpenJDK build scripts - common across all '\n", + " 'releases/versions\", \"pull_request.base.repo.fork\": '\n", + " 'false, 
\"pull_request.base.repo.created_at\": '\n", + " '\"2017-03-17T09:31:50Z\", '\n", + " '\"pull_request.base.repo.updated_at\": '\n", + " '\"2020-05-28T07:45:12Z\", '\n", + " '\"pull_request.base.repo.pushed_at\": '\n", + " '\"2020-05-27T14:18:11Z\", '\n", + " '\"pull_request.base.repo.size\": 2234, '\n", + " '\"pull_request.base.repo.stargazers_count\": 620, '\n", + " '\"pull_request.base.repo.watchers_count\": 620, '\n", + " '\"pull_request.base.repo.language\": \"Shell\", '\n", + " '\"pull_request.base.repo.has_issues\": true, '\n", + " '\"pull_request.base.repo.has_projects\": true, '\n", + " '\"pull_request.base.repo.has_downloads\": true, '\n", + " '\"pull_request.base.repo.has_wiki\": true, '\n", + " '\"pull_request.base.repo.has_pages\": false, '\n", + " '\"pull_request.base.repo.forks_count\": 137, '\n", + " '\"pull_request.base.repo.archived\": false, '\n", + " '\"pull_request.base.repo.disabled\": false, '\n", + " '\"pull_request.base.repo.open_issues_count\": 166, '\n", + " '\"pull_request.base.repo.forks\": 137, '\n", + " '\"pull_request.base.repo.open_issues\": 166, '\n", + " '\"pull_request.base.repo.watchers\": 620, '\n", + " '\"pull_request.base.repo.default_branch\": \"master\", '\n", + " '\"pull_request.base.repo.license.key\": \"apache-2.0\", '\n", + " '\"pull_request.base.repo.license.spdx_id\": '\n", + " '\"Apache-2.0\", \"pull_request.base.repo.license.name\": '\n", + " '\"Apache License 2.0\", \"pull_request.guid\": '\n", + " '\"AdoptOpenJDK/openjdk-build/pull/1787\"}, {\"type\": '\n", + " '\"PullRequestEvent\", \"action\": \"closed\", '\n", + " '\"actor.login\": \"sxa\", \"actor.id\": 6487691, '\n", + " '\"user.login\": null, \"user.id\": null, \"user.type\": '\n", + " 'null, \"repo.name\": \"AdoptOpenJDK/openjdk-build\", '\n", + " '\"repo.id\": 85294562, \"public\": true, \"created_at\": '\n", + " '\"2020-05-28T09:51:49Z\", \"org.id\": 1673867, '\n", + " '\"org.login\": \"AdoptOpenJDK\", \"pull_request.id\": '\n", + " '424372800, 
\"pull_request.number\": 1787, '\n", + " '\"pull_request.state\": \"closed\", \"pull_request.title\": '\n", + " '\"Revert \\'Fire installer failure on all failed '\n", + " 'results\\'\", \"pull_request.body\": \"* Seems to cause a '\n", + " 'lot of false positives or just doesnt work overall. '\n", + " 'Better to just remove for '\n", + " 'now\\\\r\\\\n\\\\r\\\\nSigned-off-by: Morgan Davies '\n", + " '\", \"pull_request.user.login\": '\n", + " '\"M-Davies\", \"pull_request.user.id\": 25231953, '\n", + " '\"pull_request.author_association\": \"CONTRIBUTOR\", '\n", + " '\"pull_request.created_at\": \"2020-05-28T09:45:30Z\", '\n", + " '\"pull_request.updated_at\": \"2020-05-28T09:51:48Z\", '\n", + " '\"pull_request.closed_at\": \"2020-05-28T09:51:48Z\", '\n", + " '\"pull_request.merged_at\": \"2020-05-28T09:51:48Z\", '\n", + " '\"pull_request.merge_commit_sha\": '\n", + " '\"4c3495c6f008459ca1c276477c5f968e9dcd7c6b\", '\n", + " '\"pull_request.locked\": false, '\n", + " '\"pull_request.assignee.login\": null, '\n", + " '\"pull_request.assignee.id\": null, '\n", + " '\"pull_request.assignee.type\": null, '\n", + " '\"pull_request.assignee.site_admin\": null, '\n", + " '\"pull_request.milestone.id\": null, '\n", + " '\"pull_request.milestone.number\": null, '\n", + " '\"pull_request.milestone.title\": null, '\n", + " '\"pull_request.milestone.description\": null, '\n", + " '\"pull_request.milestone.creator.login\": null, '\n", + " '\"pull_request.milestone.creator.id\": null, '\n", + " '\"pull_request.milestone.creator.type\": null, '\n", + " '\"pull_request.milestone.creator.site_admin\": null, '\n", + " '\"pull_request.milestone.open_issues\": null, '\n", + " '\"pull_request.milestone.closed_issues\": null, '\n", + " '\"pull_request.milestone.state\": null, '\n", + " '\"pull_request.milestone.created_at\": null, '\n", + " '\"pull_request.milestone.updated_at\": null, '\n", + " '\"pull_request.milestone.due_on\": null, '\n", + " '\"pull_request.milestone.closed_at\": 
null, '\n", + " '\"pull_request.merged\": true, '\n", + " '\"pull_request.mergeable\": null, '\n", + " '\"pull_request.mergeable_state\": \"unknown\", '\n", + " '\"pull_request.merged_by.login\": \"sxa\", '\n", + " '\"pull_request.merged_by.id\": 6487691, '\n", + " '\"pull_request.merged_by.type\": \"User\", '\n", + " '\"pull_request.merged_by.site_admin\": false, '\n", + " '\"pull_request.comments\": 0, '\n", + " '\"pull_request.review_comments\": 0, '\n", + " '\"pull_request.commits\": 1, \"pull_request.additions\": '\n", + " '4, \"pull_request.deletions\": 6, '\n", + " '\"pull_request.changed_files\": 1, '\n", + " '\"pull_request.label.id\": null, '\n", + " '\"pull_request.label.name\": null, '\n", + " '\"pull_request.label.color\": null, '\n", + " '\"pull_request.label.default\": null, '\n", + " '\"pull_request.head.label\": \"M-Davies:revert\", '\n", + " '\"pull_request.head.ref\": \"revert\", '\n", + " '\"pull_request.head.sha\": '\n", + " '\"023faba7db4130d746f68e6b4fb26170a3834254\", '\n", + " '\"pull_request.head.user.login\": \"M-Davies\", '\n", + " '\"pull_request.head.user.type\": \"User\", '\n", + " '\"pull_request.head.repo.name\": \"openjdk-build\", '\n", + " '\"pull_request.head.repo.full_name\": '\n", + " '\"M-Davies/openjdk-build\", '\n", + " '\"pull_request.head.repo.owner.login\": \"M-Davies\", '\n", + " '\"pull_request.head.repo.owner.type\": \"User\", '\n", + " '\"pull_request.head.repo.private\": false, '\n", + " '\"pull_request.head.repo.homepage\": \"\", '\n", + " '\"pull_request.head.repo.description\": \"AdoptOpenJDK '\n", + " 'community OpenJDK build scripts - common across all '\n", + " 'releases/versions\", \"pull_request.head.repo.fork\": '\n", + " 'true, \"pull_request.head.repo.created_at\": '\n", + " '\"2019-11-29T09:24:43Z\", '\n", + " '\"pull_request.head.repo.updated_at\": '\n", + " '\"2020-05-27T14:45:16Z\", '\n", + " '\"pull_request.head.repo.pushed_at\": '\n", + " '\"2020-05-28T09:46:04Z\", '\n", + " 
'\"pull_request.head.repo.size\": 2383, '\n", + " '\"pull_request.head.repo.stargazers_count\": 0, '\n", + " '\"pull_request.head.repo.watchers_count\": 0, '\n", + " '\"pull_request.head.repo.language\": \"Shell\", '\n", + " '\"pull_request.head.repo.has_issues\": false, '\n", + " '\"pull_request.head.repo.has_projects\": true, '\n", + " '\"pull_request.head.repo.has_downloads\": true, '\n", + " '\"pull_request.head.repo.has_wiki\": true, '\n", + " '\"pull_request.head.repo.has_pages\": false, '\n", + " '\"pull_request.head.repo.forks_count\": 0, '\n", + " '\"pull_request.head.repo.archived\": false, '\n", + " '\"pull_request.head.repo.disabled\": false, '\n", + " '\"pull_request.head.repo.open_issues_count\": 0, '\n", + " '\"pull_request.head.repo.forks\": 0, '\n", + " '\"pull_request.head.repo.open_issues\": 0, '\n", + " '\"pull_request.head.repo.watchers\": 0, '\n", + " '\"pull_request.head.repo.default_branch\": \"master\", '\n", + " '\"pull_request.head.repo.license.key\": \"apache-2.0\", '\n", + " '\"pull_request.head.repo.license.spdx_id\": '\n", + " '\"Apache-2.0\", \"pull_request.head.repo.license.name\": '\n", + " '\"Apache License 2.0\", \"pull_request.base.label\": '\n", + " '\"AdoptOpenJDK:master\", \"pull_request.base.ref\": '\n", + " '\"master\", \"pull_request.base.sha\": '\n", + " '\"32a19e7a01b4d50cc8c10f8f675a2aeb2ffeaefb\", '\n", + " '\"pull_request.base.user.login\": \"AdoptOpenJDK\", '\n", + " '\"pull_request.base.user.type\": \"Organization\", '\n", + " '\"pull_request.base.repo.name\": \"openjdk-build\", '\n", + " '\"pull_request.base.repo.full_name\": '\n", + " '\"AdoptOpenJDK/openjdk-build\", '\n", + " '\"pull_request.base.repo.owner.login\": \"AdoptOpenJDK\", '\n", + " '\"pull_request.base.repo.owner.type\": \"Organization\", '\n", + " '\"pull_request.base.repo.private\": false, '\n", + " '\"pull_request.base.repo.homepage\": \"\", '\n", + " '\"pull_request.base.repo.description\": \"AdoptOpenJDK '\n", + " 'community OpenJDK build scripts 
- common across all '\n", + " 'releases/versions\", \"pull_request.base.repo.fork\": '\n", + " 'false, \"pull_request.base.repo.created_at\": '\n", + " '\"2017-03-17T09:31:50Z\", '\n", + " '\"pull_request.base.repo.updated_at\": '\n", + " '\"2020-05-28T07:45:12Z\", '\n", + " '\"pull_request.base.repo.pushed_at\": '\n", + " '\"2020-05-28T09:51:48Z\", '\n", + " '\"pull_request.base.repo.size\": 2234, '\n", + " '\"pull_request.base.repo.stargazers_count\": 620, '\n", + " '\"pull_request.base.repo.watchers_count\": 620, '\n", + " '\"pull_request.base.repo.language\": \"Shell\", '\n", + " '\"pull_request.base.repo.has_issues\": true, '\n", + " '\"pull_request.base.repo.has_projects\": true, '\n", + " '\"pull_request.base.repo.has_downloads\": true, '\n", + " '\"pull_request.base.repo.has_wiki\": true, '\n", + " '\"pull_request.base.repo.has_pages\": false, '\n", + " '\"pull_request.base.repo.forks_count\": 137, '\n", + " '\"pull_request.base.repo.archived\": false, '\n", + " '\"pull_request.base.repo.disabled\": false, '\n", + " '\"pull_request.base.repo.open_issues_count\": 165, '\n", + " '\"pull_request.base.repo.forks\": 137, '\n", + " '\"pull_request.base.repo.open_issues\": 165, '\n", + " '\"pull_request.base.repo.watchers\": 620, '\n", + " '\"pull_request.base.repo.default_branch\": \"master\", '\n", + " '\"pull_request.base.repo.license.key\": \"apache-2.0\", '\n", + " '\"pull_request.base.repo.license.spdx_id\": '\n", + " '\"Apache-2.0\", \"pull_request.base.repo.license.name\": '\n", + " '\"Apache License 2.0\", \"pull_request.guid\": '\n", + " '\"AdoptOpenJDK/openjdk-build/pull/1787\"}]',\n", + " 'pull_request.guid': 'AdoptOpenJDK/openjdk-build/pull/1787',\n", + " 'pull_request.issue_events': None}\n" + ] + } + ], + "source": [ + "from pprint import pprint\n", + "pprint(small_ds[50])" + ] + }, + { + "cell_type": "code", + "execution_count": 151, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'action': 
'opened',\n", + " 'actor.id': 25231953,\n", + " 'actor.login': 'M-Davies',\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': datetime.datetime(2020, 5, 28, 9, 45, 30, tzinfo=),\n", + " 'issue.author': None,\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': None,\n", + " 'pull_request.merged': False,\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.state': 'open',\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'PullRequestEvent',\n", + " 'user.login': None,\n", + " 'user.type': None},\n", + " {'action': 'closed',\n", + " 'actor.id': 6487691,\n", + " 'actor.login': 'sxa',\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 
'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': datetime.datetime(2020, 5, 28, 9, 51, 49, tzinfo=),\n", + " 'issue.author': None,\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': None,\n", + " 'pull_request.merged': True,\n", + " 'pull_request.merged_by.login': 'sxa',\n", + " 'pull_request.merged_by.type': 'User',\n", + " 'pull_request.state': 'closed',\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'PullRequestEvent',\n", + " 'user.login': None,\n", + " 'user.type': None}]\n" + ] + } + ], + "source": [ + "pprint(merged_ds[50][\"events\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 222, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Found cached dataset parquet (/Users/loubnabenallal/.cache/huggingface/datasets/loubnabnl___parquet/loubnabnl--code_reviews_3-c3e4ac735edf14b4/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)\n" + ] + } + ], + "source": [ + "ds = load_dataset(\"loubnabnl/code_reviews_3\", split=\"train\")\n", + "size = len(ds)" + ] + }, + { + "cell_type": "code", + "execution_count": 223, + "metadata": {}, + "outputs": [], + "source": [ + "sample = ds[1470]\n", + "events = sample[\"events\"]\n", + "grouped_events = create_grouped_events(events)\n", + "original_poster = sample[\"pull_request_info\"]['pull_request.user.login']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pprint import pprint\n", + "\n", + "pprint(small_ds[50])" + ] + }, + { + "cell_type": "code", + "execution_count": 224, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "📝 **Title**: Fix @inheritDocs behavior
\n", + " 📦 **GitHub Repo**: Azure/azure-sdk-for-java, PR Number: 26816, ID: 836647691.
\n", + " Link: [https://github.com/Azure/azure-sdk-for-java/pull/26816](https://github.com/Azure/azure-sdk-for-java/pull/26816)" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AttributeDetail
🧾 PR Typeissue
🟢 PR Stateopen
👤 PR Authorkasobol-msft
🏷️ Head Branchref: kasobol-msft-patch-1, label: Azure:kasobol-msft-patch-1
🌳 Base Branchmain
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Make sure that dependency sources are included in javadoc generation.\r\n", + "\r\n", + "Fixes https://github.com/Azure/azure-sdk-for-java/issues/26814" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def get_pr_info(sample):\n", + " pr_info = sample[\"pull_request_info\"]\n", + " head_info = sample[\"head_repo_info\"]\n", + " base_info = sample[\"base_repo_info\"]\n", + " events = sample[\"events\"]\n", + "\n", + " gh_link = f\"https://github.com/{pr_info['repo.name']}/pull/{pr_info['pull_request.number']}\"\n", + "\n", + " header = f\"\"\"📝 **Title**: {pr_info['pull_request.title']}
\n", + " 📦 **GitHub Repo**: {pr_info['repo.name']}, PR Number: {pr_info['pull_request.number']}, ID: {pr_info['pull_request.id']}.
\n", + " Link: [{gh_link}]({gh_link})\"\"\"\n", + " pr_info_html = f\"\"\"\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AttributeDetail
🧾 PR Type{events[0]['type']}
🟢 PR State{pr_info['pull_request.state']}
👤 PR Author{pr_info['pull_request.user.login']}
🏷️ Head Branchref: {head_info['pull_request.head.ref']}, label: {head_info['pull_request.head.label']}
🌳 Base Branch{base_info['pull_request.base.ref']}
\n", + " \"\"\"\n", + " return header, pr_info_html, pr_info['pull_request.body']\n", + "\n", + "from IPython.display import HTML, display\n", + "display(HTML(get_pr_info(sample)[0]))\n", + "display(HTML(get_pr_info(sample)[1]))\n", + "display(HTML(get_pr_info(sample)[2]))" + ] + }, + { + "cell_type": "code", + "execution_count": 308, + "metadata": {}, + "outputs": [], + "source": [ + "sample = ds[4]\n", + "events = sample[\"events\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 309, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2" + ] + }, + "execution_count": 309, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(events)" + ] + }, + { + "cell_type": "code", + "execution_count": 310, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'action': 'opened',\n", + " 'actor.id': 39814207,\n", + " 'actor.login': 'pull[bot]',\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': datetime.datetime(2022, 10, 10, 10, 57, 41, tzinfo=),\n", + " 'issue.author': None,\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': None,\n", + " 'pull_request.merged': False,\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.state': 'open',\n", + " 'review.author_association': None,\n", + " 
'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'PullRequestEvent',\n", + " 'user.login': None,\n", + " 'user.type': None},\n", + " {'action': 'closed',\n", + " 'actor.id': 39814207,\n", + " 'actor.login': 'pull[bot]',\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': datetime.datetime(2022, 10, 10, 11, 1, 28, tzinfo=),\n", + " 'issue.author': None,\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': None,\n", + " 'pull_request.merged': True,\n", + " 'pull_request.merged_by.login': 'pull[bot]',\n", + " 'pull_request.merged_by.type': 'Bot',\n", + " 'pull_request.state': 'closed',\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'PullRequestEvent',\n", + " 'user.login': None,\n", + " 'user.type': None}]\n" + ] + } + ], + "source": [ + "pprint(events)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "import uuid\n", + "\n", + "def create_grouped_events(events):\n", + " df = pd.DataFrame(events)\n", + " # Ensure it's in datetime format\n", + " df['created_at'] = pd.to_datetime(df['created_at'])\n", + 
" # Create a new column 'uuid' initialized with None\n", + " df['uuid'] = None\n", + " # For rows where either 'comment.diff_hunk' or 'comment.commit_id' is NaN, assign a unique UUID\n", + " mask = df['comment.diff_hunk'].isna() | df['comment.commit_id'].isna()\n", + " df.loc[mask, 'uuid'] = [str(uuid.uuid4()) for _ in range(mask.sum())]\n", + " # Group by 'comment.diff_hunk', 'comment.commit_id', and 'uuid'\n", + " grouped_events = [group.drop(columns='uuid').to_dict(orient='records') for _, group in df.groupby(['comment.diff_hunk', 'comment.commit_id', 'uuid'], dropna=False)]\n", + " return grouped_events\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 229, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "846\n" + ] + } + ], + "source": [ + "for i in range(len(ds)):\n", + " e = ds[i]\n", + " if e[\"events\"][0][\"comment.diff_hunk\"]:\n", + " print(i)\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": 299, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'action': 'opened',\n", + " 'actor.id': 1753262,\n", + " 'actor.login': 'mo9a7i',\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': datetime.datetime(2022, 5, 5, 4, 35, 2, tzinfo=),\n", + " 'issue.author': None,\n", + " 'issue.comment': None,\n", + " 
'issue.comment_id': None,\n", + " 'pull_request.merged': False,\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.state': 'open',\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'PullRequestEvent',\n", + " 'user.login': None,\n", + " 'user.type': None},\n", + " {'action': 'created',\n", + " 'actor.id': 1753262,\n", + " 'actor.login': 'mo9a7i',\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': datetime.datetime(2022, 5, 5, 4, 35, 2, tzinfo=),\n", + " 'issue.author': None,\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': None,\n", + " 'pull_request.merged': None,\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.state': 'open',\n", + " 'review.author_association': 'MEMBER',\n", + " 'review.body': 'looks fine',\n", + " 'review.commit_id': 'ba75444d1ada77cf5f3f06cd74b6320bab8db54b',\n", + " 'review.id': 962846794,\n", + " 'review.state': 'commented',\n", + " 'review.submitted_at': '2022-05-05T04:35:02Z',\n", + " 'type': 'PullRequestReviewEvent',\n", + " 'user.login': 'mo9a7i',\n", + " 'user.type': 'User'},\n", + " {'action': 'closed',\n", + " 'actor.id': 
1753262,\n", + " 'actor.login': 'mo9a7i',\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': datetime.datetime(2022, 5, 5, 4, 35, 3, tzinfo=),\n", + " 'issue.author': None,\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': None,\n", + " 'pull_request.merged': True,\n", + " 'pull_request.merged_by.login': 'mo9a7i',\n", + " 'pull_request.merged_by.type': 'User',\n", + " 'pull_request.state': 'closed',\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'PullRequestEvent',\n", + " 'user.login': None,\n", + " 'user.type': None}]\n" + ] + } + ], + "source": [ + "pprint(events)" + ] + }, + { + "cell_type": "code", + "execution_count": 303, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
actionactor.idactor.logincomment.author_associationcomment.bodycomment.commit_idcomment.created_atcomment.diff_hunkcomment.idcomment.in_reply_to_id...review.author_associationreview.bodyreview.commit_idreview.idreview.statereview.submitted_attypeuser.loginuser.typegroup_key
0opened1753262mo9a7iNoneNoneNoneNoneNoneNoneNone...NoneNoneNoneNaNNoneNonePullRequestEventNoneNone1.0
1created1753262mo9a7iNoneNoneNoneNoneNoneNoneNone...MEMBERlooks fineba75444d1ada77cf5f3f06cd74b6320bab8db54b962846794.0commented2022-05-05T04:35:02ZPullRequestReviewEventmo9a7iUserba75444d1ada77cf5f3f06cd74b6320bab8db54b
2closed1753262mo9a7iNoneNoneNoneNoneNoneNoneNone...NoneNoneNoneNaNNoneNonePullRequestEventNoneNone2.0
\n", + "

3 rows × 39 columns

\n", + "
" + ], + "text/plain": [ + " action actor.id actor.login comment.author_association comment.body \n", + "0 opened 1753262 mo9a7i None None \\\n", + "1 created 1753262 mo9a7i None None \n", + "2 closed 1753262 mo9a7i None None \n", + "\n", + " comment.commit_id comment.created_at comment.diff_hunk comment.id \n", + "0 None None None None \\\n", + "1 None None None None \n", + "2 None None None None \n", + "\n", + " comment.in_reply_to_id ... review.author_association review.body \n", + "0 None ... None None \\\n", + "1 None ... MEMBER looks fine \n", + "2 None ... None None \n", + "\n", + " review.commit_id review.id review.state \n", + "0 None NaN None \\\n", + "1 ba75444d1ada77cf5f3f06cd74b6320bab8db54b 962846794.0 commented \n", + "2 None NaN None \n", + "\n", + " review.submitted_at type user.login user.type \n", + "0 None PullRequestEvent None None \\\n", + "1 2022-05-05T04:35:02Z PullRequestReviewEvent mo9a7i User \n", + "2 None PullRequestEvent None None \n", + "\n", + " group_key \n", + "0 1.0 \n", + "1 ba75444d1ada77cf5f3f06cd74b6320bab8db54b \n", + "2 2.0 \n", + "\n", + "[3 rows x 39 columns]" + ] + }, + "execution_count": 303, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "df = pd.DataFrame(events)\n", + "df['created_at'] = pd.to_datetime(df['created_at'])\n", + "df.drop_duplicates(inplace=True)\n", + "# Create a new 'group_key' column. For non-null 'review.commit_id' values, it's the same value.\n", + "mask = df['review.commit_id'].isnull()\n", + "df.loc[mask, 'group_key'] = np.arange(mask.sum()) + 1\n", + "df.loc[~mask, 'group_key'] = df['review.commit_id']\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 304, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "df = pd.DataFrame(events)\n", + "df['created_at'] = pd.to_datetime(df['created_at'])\n", + "df.drop_duplicates(inplace=True)\n", + "# Create a new 'group_key' column. 
For non-null 'review.commit_id' values, it's the same value.\n", + "mask = df['review.commit_id'].isnull()\n", + "df.loc[mask, 'group_key'] = np.arange(mask.sum()) + 1\n", + "df.loc[~mask, 'group_key'] = df['review.commit_id']\n", + "\n", + "if len(df) == 1:\n", + " grouped_events = [[df.iloc[0].to_dict()]]\n", + "else:\n", + " grouped_events = [group.to_dict(orient='records') for _, group in df.groupby('group_key', dropna=False)]\n", + "\n", + "# sort by first event date\n", + "grouped_events = sorted(grouped_events, key=lambda x: x[0]['created_at'])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 311, + "metadata": {}, + "outputs": [], + "source": [ + "def create_grouped_events(events):\n", + " \"\"\"group events that happened in the same review thread using review.commit_id\"\"\"\n", + " df = pd.DataFrame(events)\n", + " df['created_at'] = pd.to_datetime(df['created_at'])\n", + " df.drop_duplicates(inplace=True)\n", + " # Create a new 'group_key' where rows with NaN 'review.commit_id' get an identical identifier. 
Otherwise NaN values go in the same group\n", + " mask = df['review.commit_id'].isnull()\n", + " df.loc[mask, 'group_key'] = np.arange(mask.sum()) + 1\n", + " df.loc[~mask, 'group_key'] = df['review.commit_id']\n", + " \n", + " if len(df) == 1:\n", + " grouped_events = [[df.iloc[0].to_dict()]]\n", + " else:\n", + " grouped_events = [group.to_dict(orient='records') for _, group in df.groupby('group_key', dropna=False)]\n", + " \n", + " # sort by first event date\n", + " grouped_events = sorted(grouped_events, key=lambda x: x[0]['created_at'])\n", + " return grouped_events\n", + "\n", + "grouped_events = create_grouped_events(events)" + ] + }, + { + "cell_type": "code", + "execution_count": 312, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "len events 2 and len grouped_events 2\n" + ] + } + ], + "source": [ + "print(f\"len events {len(events)} and len grouped_events {len(grouped_events)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 313, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "thread number 0\n", + "thread number 1\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \"\"\n", + " \n", + " \n", + "
Event TypePullRequestEvent
Userpull[bot]
Actionopened
Review StateNone
PR Stateopen, merged: False
Date2022-10-10 10:57:41+00:00
\n", + "
\n", + " \n", + "---------------------------------------------------------------------------------------------------------------------------------------------------------------------
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \"\"\n", + " \n", + " \n", + "
Event TypePullRequestEvent
Userpull[bot]
Actionclosed
Review StateNone
PR Stateclosed, merged: True
Date2022-10-10 11:01:28+00:00
\n", + "
\n", + " \n", + "---------------------------------------------------------------------------------------------------------------------------------------------------------------------
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "original_poster = sample[\"pull_request_info\"]['pull_request.user.login']\n", + "thread_html = \"\"\n", + "c = 0\n", + "for thread in grouped_events:\n", + " print(f\"thread number {c}\")\n", + " c += 1\n", + " thread_html += '
'\n", + " # Get the first event in the thread as a reference\n", + " first_event = thread[0]\n", + " poster_name = first_event['actor.login'] or first_event['issue.author'] or first_event['user.login']\n", + " # Add shared parts of the events only once\n", + " user_type = f\"(type :{first_event['user.type']})\" if first_event['user.type'] else \"\"\n", + " review_state = f\"Review State{first_event['review.state']}\" if first_event['review.state'] else \"\"\n", + " text = f\"\"\"\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " {review_state}\n", + " \n", + " \n", + "
Event Type{first_event['type']}
User{poster_name} {user_type}
Action{first_event['action']}
PR State{first_event[\"pull_request.state\"]}, merged: {first_event['pull_request.merged']}
Date{first_event['created_at']}
\n", + "
\n", + " \"\"\"\n", + " highlight_action = \"background-color: #FFCFCF;\" if first_event['action'] == 'closed' else \"\"\n", + " highlight_pr_state = \"background-color: #FFCFCF;\" if first_event['pull_request.merged'] else \"\"\n", + "\n", + " text = f\"\"\"\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \"\"\n", + " \n", + " \n", + "
Event Type{first_event['type']}
User{poster_name} {user_type}
Action{first_event['action']}
Review State{first_event['review.state']}
PR State{first_event[\"pull_request.state\"]}, merged: {first_event['pull_request.merged']}
Date{first_event['created_at']}
\n", + "
\n", + " \"\"\"\n", + "\n", + "\n", + " thread_html += text\n", + " thread_html += (\"\\n\" + \"-\"*165)\n", + " # Add the bodies of the comments for each event in the thread\n", + " for event in thread:\n", + " # from 'actor.login' and 'issue.author' and 'user.login' take which ever isn't none\n", + " poster_name = event['actor.login'] or event['issue.author'] or event['user.login']\n", + " if event['comment.body'] or event[\"issue.comment\"]:\n", + " is_op = original_poster == poster_name\n", + " thread_html += format_body(event['comment.body'], poster_name, is_op)\n", + "\n", + " thread_html += '
'\n", + "\n", + "display(HTML(thread_html))" + ] + }, + { + "cell_type": "code", + "execution_count": 314, + "metadata": {}, + "outputs": [], + "source": [ + "def display_events(sample):\n", + " events = sample[\"events\"]\n", + " grouped_events = create_grouped_events(events)\n", + " original_poster = sample[\"pull_request_info\"]['pull_request.user.login']\n", + " for thread in grouped_events:\n", + " thread_html = '
'\n", + " # Get the first event in the thread as a reference\n", + " first_event = thread[0]\n", + " poster_name = first_event['actor.login'] or first_event['issue.author'] or first_event['user.login']\n", + " # Add shared parts of the events only once\n", + " user_type = f\"(type :{first_event['user.type']})\" if first_event['user.type'] else \"\"\n", + " highlight_action = \"background-color: #FFCFCF;\" if first_event['action'] == 'closed' else \"\"\n", + " highlight_pr_state = \"background-color: #FFCFCF;\" if first_event['pull_request.merged'] else \"\"\n", + " \n", + " text = f\"\"\"\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Event Type{first_event['type']}
User{poster_name} {user_type}
Action{first_event['action']}
Review State{first_event['review.state']}
PR State{first_event[\"pull_request.state\"]}, merged: {first_event['pull_request.merged']}
Date{first_event['created_at']}
\n", + "
\n", + " \"\"\"\n", + " print(f\"added first event of teh group\")\n", + " thread_html += text\n", + " \n", + " # Add the bodies of the comments for each event in the thread\n", + " for event in thread:\n", + " # from 'actor.login' and 'issue.author' and 'user.login' take which ever isn't none\n", + " poster_name = event['actor.login'] or event['issue.author'] or event['user.login']\n", + " if event['comment.body'] or event[\"issue.comment\"]:\n", + " is_op = original_poster == poster_name\n", + " thread_html += format_body(event['comment.body'], poster_name, is_op)\n", + "\n", + " thread_html += '
'\n", + " display(HTML(thread_html))\n", + " if first_event['comment.path']:\n", + " path_html = f\"Path: {first_event['comment.path']}\"\n", + " display(HTML(path_html))\n", + " display(HTML(\"---\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 316, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[[{'action': 'opened',\n", + " 'actor.id': 39814207,\n", + " 'actor.login': 'pull[bot]',\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': Timestamp('2022-10-10 10:57:41+0000', tz='UTC'),\n", + " 'issue.author': None,\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': None,\n", + " 'pull_request.merged': False,\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.state': 'open',\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'PullRequestEvent',\n", + " 'user.login': None,\n", + " 'user.type': None,\n", + " 'group_key': 1.0}],\n", + " [{'action': 'closed',\n", + " 'actor.id': 39814207,\n", + " 'actor.login': 'pull[bot]',\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': 
None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': Timestamp('2022-10-10 11:01:28+0000', tz='UTC'),\n", + " 'issue.author': None,\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': None,\n", + " 'pull_request.merged': True,\n", + " 'pull_request.merged_by.login': 'pull[bot]',\n", + " 'pull_request.merged_by.type': 'Bot',\n", + " 'pull_request.state': 'closed',\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'PullRequestEvent',\n", + " 'user.login': None,\n", + " 'user.type': None,\n", + " 'group_key': 2.0}]]" + ] + }, + "execution_count": 316, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grouped_events" + ] + }, + { + "cell_type": "code", + "execution_count": 315, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Event TypePullRequestEvent
Userpull[bot]
Actionopened
Review StateNone
PR Stateopen, merged: False
Date2022-10-10 10:57:41+00:00
\n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "---" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Event TypePullRequestEvent
Userpull[bot]
Actionopened
Review StateNone
PR Stateopen, merged: False
Date2022-10-10 10:57:41+00:00
\n", + "
\n", + "
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Event TypePullRequestEvent
Userpull[bot]
Actionclosed
Review StateNone
PR Stateclosed, merged: True
Date2022-10-10 11:01:28+00:00
\n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "---" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display_events(sample)" + ] + }, + { + "cell_type": "code", + "execution_count": 261, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "single\n", + "2022-05-05 04:35:02+00:00\n", + "with review state: commented\n", + "None\n", + "User: mo9a7i, action: created\n", + "PullRequestReviewEvent\n", + "------------\n", + "multiple\n", + "single\n", + "2022-05-05 04:35:02+00:00\n", + "with review state: None\n", + "None\n", + "User: mo9a7i, action: opened\n", + "PullRequestEvent\n", + "------------\n", + "------------\n", + "2022-05-05 04:35:02+00:00\n", + "with review state: None PR state False\n", + "None\n", + "User: mo9a7i, action: closed\n", + "PullRequestEvent\n", + "------------\n", + "------------end multiple\n" + ] + } + ], + "source": [ + "for group in grouped_events:\n", + " if len(group) == 1:\n", + " poster_name = group[0]['actor.login'] or group[0]['issue.author'] or group[0]['user.login']\n", + " print(\"single\")\n", + " print(group[0][\"created_at\"])\n", + " print(f\"with review state: {group[0]['review.state']}\")\n", + " print(group[0][\"comment.body\"])\n", + " # print action type and user\n", + " print(f\"User: {poster_name}, action: {group[0]['action']}\")\n", + " print(group[0][\"type\"])\n", + " print(\"------------\")\n", + " continue\n", + " # date \n", + " else:\n", + " print(\"multiple\")\n", + " poster_name = group[0]['actor.login'] or group[0]['issue.author'] or group[0]['user.login']\n", + " print(\"single\")\n", + " print(group[0][\"created_at\"])\n", + " print(f\"with review state: {group[0]['review.state']}\")\n", + " print(group[0][\"comment.body\"])\n", + " # print action type and user\n", + " print(f\"User: {poster_name}, action: 
{group[0]['action']}\")\n", + " print(group[0][\"type\"])\n", + " print(\"------------\")\n", + " print(\"------------\")\n", + " for e in group[1:]:\n", + " print(group[0][\"created_at\"])\n", + " print(f\"with review state: {group[0]['review.state']} PR state {group[0]['pull_request.merged']}\")\n", + " print(e[\"comment.body\"])\n", + " poster_name = e['actor.login'] or e['issue.author'] or e['user.login']\n", + " print(f\"User: {poster_name}, action: {e['action']}\")\n", + " print(e[\"type\"])\n", + " print(\"------------\")\n", + " print(\"------------end multiple\")" + ] + }, + { + "cell_type": "code", + "execution_count": 225, + "metadata": {}, + "outputs": [], + "source": [ + "def create_grouped_events(events):\n", + " df = pd.DataFrame(events)\n", + " df['created_at'] = pd.to_datetime(df['created_at'])\n", + " df = df.sort_values(['comment.diff_hunk', 'comment.commit_id', 'created_at'])\n", + " # Group events in a the same thread using 'comment.diff_hunk' and 'comment.commit_id'\n", + " if len(df) == 1:\n", + " grouped_events = [[df.iloc[0].to_dict()]]\n", + " else:\n", + " grouped_events = [group.to_dict(orient='records') for _, group in df.groupby(['comment.diff_hunk', 'comment.commit_id', 'pull_request.state'], dropna=False)]\n", + " return grouped_events\n", + "\n", + "def format_body(text, user, is_op=False):\n", + " color = \"#007bff\" if is_op else \"black\"\n", + " pr_body = f\"
👤{user}: {text}
\"\n", + " return pr_body" + ] + }, + { + "cell_type": "code", + "execution_count": 220, + "metadata": {}, + "outputs": [], + "source": [ + "import uuid\n", + "import pandas as pd\n", + "\n", + "def create_grouped_events(events):\n", + " df = pd.DataFrame(events)\n", + " \n", + " # Ensure it's in datetime format\n", + " df['created_at'] = pd.to_datetime(df['created_at'])\n", + " # Preserve the original order\n", + " df['order'] = range(len(df))\n", + "\n", + " # Create a new column 'uuid' initialized with None\n", + " df['uuid'] = None\n", + "\n", + " # For rows where either 'comment.diff_hunk' or 'comment.commit_id' is NaN, assign a unique UUID\n", + " mask = df['comment.diff_hunk'].isna() | df['comment.commit_id'].isna()\n", + " df.loc[mask, 'uuid'] = [str(uuid.uuid4()) for _ in range(mask.sum())]\n", + "\n", + " # Group by 'comment.diff_hunk', 'comment.commit_id', and 'uuid'\n", + " grouped_events = [group.drop(columns=['uuid', 'order']).to_dict(orient='records') \n", + " for _, group in df.sort_values(by='order').groupby(['comment.diff_hunk', 'comment.commit_id', 'uuid'], dropna=False)]\n", + " # soert on created_at\n", + " grouped_events = [sorted(group, key=lambda x: x['created_at']) for group in grouped_events]\n", + " return grouped_events\n", + "\n", + "\n", + "\n", + "grouped_events = create_grouped_events(events)\n", + "c = 0\n", + "thread_html = \"\"\n", + "for thread in grouped_events:\n", + " # Start a new thread\n", + " #print(thread)\n", + " if thread[0][\"action\"] == \"opened\":\n", + " continue\n", + " thread_html += '
'\n", + " # Get the first event in the thread as a reference\n", + " first_event = thread[0]\n", + " poster_name = first_event['actor.login'] or first_event['issue.author'] or first_event['user.login']\n", + " # Add shared parts of the events only once\n", + " text = f\"\"\"\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Event Type{first_event['type']}
User{poster_name} (type :{first_event['user.type']})
Action{first_event['action']}
Review State{first_event['review.state']}
PR State{first_event[\"pull_request.state\"]}, merged: {first_event['pull_request.merged']}
From Head{sample[\"head_repo_info\"]['pull_request.head.label']}
\n", + "
\n", + " \"\"\"\n", + " thread_html += text\n", + " # add horizontal line\n", + " thread_html += '
'\n", + " for event in thread:\n", + " # from 'actor.login' and 'issue.author' and 'user.login' take which ever isn't none\n", + " poster_name = event['actor.login'] or event['issue.author'] or event['user.login']\n", + " if event['comment.body'] or event[\"issue.comment\"]:\n", + " is_op = original_poster == poster_name\n", + " thread_html += format_body(event['comment.body'], poster_name, is_op)\n", + "\n", + " thread_html += '
'" + ] + }, + { + "cell_type": "code", + "execution_count": 218, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4" + ] + }, + "execution_count": 218, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(grouped_events)" + ] + }, + { + "cell_type": "code", + "execution_count": 221, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[{'action': 'created',\n", + " 'actor.id': nan,\n", + " 'actor.login': None,\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': Timestamp('2022-02-01 00:05:19+0000', tz='UTC'),\n", + " 'issue.author': 'kasobol-msft',\n", + " 'issue.comment': \"This won't work well because it includes dependencies in \"\n", + " 'output like this:\\r\\n'\n", + " '![image](https://user-images.githubusercontent.com/61715331/151893024-ef3e99d9-0d83-44c6-839b-966550320642.png)\\r\\n'\n", + " '\\r\\n'\n", + " \"There's hacky way to side step this:\\r\\n\"\n", + " '![image](https://user-images.githubusercontent.com/61715331/151893056-8d018cb9-2f0d-4c7d-8848-eb9df9028b88.png)\\r\\n'\n", + " '\\r\\n'\n", + " 'But it would require be explicit about each dependency in '\n", + " 'each sdk to be precise and not risk any \"dependency doc '\n", + " 'leaks\".',\n", + " 'issue.comment_id': 1026335328.0,\n", + " 'pull_request.merged': None,\n", + " 
'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.state': None,\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'comment',\n", + " 'user.login': None,\n", + " 'user.type': None}],\n", + " [{'action': 'opened',\n", + " 'actor.id': 61715331.0,\n", + " 'actor.login': 'kasobol-msft',\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': Timestamp('2022-01-31 22:51:21+0000', tz='UTC'),\n", + " 'issue.author': None,\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': nan,\n", + " 'pull_request.merged': False,\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.state': 'open',\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'PullRequestEvent',\n", + " 'user.login': None,\n", + " 'user.type': None}],\n", + " [{'action': 'opened',\n", + " 'actor.id': nan,\n", + " 'actor.login': None,\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 
'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': Timestamp('2022-01-31 22:51:20+0000', tz='UTC'),\n", + " 'issue.author': 'kasobol-msft',\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': nan,\n", + " 'pull_request.merged': None,\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.state': None,\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'issue',\n", + " 'user.login': None,\n", + " 'user.type': None}],\n", + " [{'action': 'closed',\n", + " 'actor.id': 61715331.0,\n", + " 'actor.login': 'kasobol-msft',\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': Timestamp('2022-02-01 00:05:20+0000', tz='UTC'),\n", + " 
'issue.author': None,\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': nan,\n", + " 'pull_request.merged': False,\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.state': 'closed',\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'PullRequestEvent',\n", + " 'user.login': None,\n", + " 'user.type': None}]]\n" + ] + } + ], + "source": [ + "pprint(grouped_events)" + ] + }, + { + "cell_type": "code", + "execution_count": 193, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Event TypePullRequestEvent
Userkasobol-msft (type :None)
Actionclosed
Review StateNone
PR Stateclosed, merged: False
From HeadAzure:kasobol-msft-patch-1
\n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from IPython.display import HTML, display\n", + "display(HTML(thread_html))" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'pull_request.base.label': 'AbdElrahmanMuhammedNasr:master',\n", + " 'pull_request.base.ref': 'master',\n", + " 'pull_request.base.repo.default_branch': 'master',\n", + " 'pull_request.base.repo.description': None,\n", + " 'pull_request.base.repo.forks_count': 0,\n", + " 'pull_request.base.repo.homepage': None,\n", + " 'pull_request.base.repo.language': 'TypeScript',\n", + " 'pull_request.base.repo.license.name': None,\n", + " 'pull_request.base.repo.name': 'WuzuufMasr',\n", + " 'pull_request.base.repo.open_issues_count': 24,\n", + " 'pull_request.base.repo.owner.login': 'AbdElrahmanMuhammedNasr',\n", + " 'pull_request.base.repo.owner.type': 'User',\n", + " 'pull_request.base.repo.private': False,\n", + " 'pull_request.base.repo.stargazers_count': 0,\n", + " 'pull_request.base.repo.watchers_count': 0,\n", + " 'pull_request.base.sha': 'a7d0127c02152dca69c41f83afb1a0a4d0c0e004',\n", + " 'pull_request.base.user.login': 'AbdElrahmanMuhammedNasr',\n", + " 'pull_request.base.user.type': 'User',\n", + " 'pull_request.comments': 0,\n", + " 'pull_request.label.name': None,\n", + " 'pull_request.review_comments': 0}" + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged_ds[0][\"base_repo_info\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [], + "source": [ + "ds = merged_ds" + ] + }, + { + "cell_type": "code", + "execution_count": 321, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'__index_level_0__': 175,\n", + " 'bucket': '940',\n", + " 'pull_request.code_review_events': None,\n", + " 
'pull_request.events': '[{\"type\": \"PullRequestEvent\", \"action\": \"opened\", '\n", + " '\"actor.login\": \"pkarman\", \"actor.id\": 1205061, '\n", + " '\"user.login\": null, \"user.id\": null, \"user.type\": '\n", + " 'null, \"repo.name\": \"18F/C2\", \"repo.id\": 18201810, '\n", + " '\"public\": true, \"created_at\": \"2015-11-23T19:16:36Z\", '\n", + " '\"org.id\": 6233994, \"org.login\": \"18F\", '\n", + " '\"pull_request.id\": 51566831, \"pull_request.number\": '\n", + " '820, \"pull_request.state\": \"open\", '\n", + " '\"pull_request.title\": \"rename elk services to '\n", + " 'workaround blue-green deploy bug\", '\n", + " '\"pull_request.body\": \"there\\'s a bug in the '\n", + " 'cf-blue-green deploy that gets a false positive match '\n", + " 'based on the current ELK naming convention. I have '\n", + " 're-named all our ELK services to workaround that '\n", + " 'bug.\", \"pull_request.user.login\": \"pkarman\", '\n", + " '\"pull_request.user.id\": 1205061, '\n", + " '\"pull_request.author_association\": null, '\n", + " '\"pull_request.created_at\": \"2015-11-23T19:16:34Z\", '\n", + " '\"pull_request.updated_at\": \"2015-11-23T19:16:34Z\", '\n", + " '\"pull_request.closed_at\": null, '\n", + " '\"pull_request.merged_at\": null, '\n", + " '\"pull_request.merge_commit_sha\": '\n", + " '\"4b1557970247cde19eb3ea3992c324174d49a3d7\", '\n", + " '\"pull_request.locked\": false, '\n", + " '\"pull_request.assignee.login\": null, '\n", + " '\"pull_request.assignee.id\": null, '\n", + " '\"pull_request.assignee.type\": null, '\n", + " '\"pull_request.assignee.site_admin\": null, '\n", + " '\"pull_request.milestone.id\": null, '\n", + " '\"pull_request.milestone.number\": null, '\n", + " '\"pull_request.milestone.title\": null, '\n", + " '\"pull_request.milestone.description\": null, '\n", + " '\"pull_request.milestone.creator.login\": null, '\n", + " '\"pull_request.milestone.creator.id\": null, '\n", + " '\"pull_request.milestone.creator.type\": null, '\n", + " 
'\"pull_request.milestone.creator.site_admin\": null, '\n", + " '\"pull_request.milestone.open_issues\": null, '\n", + " '\"pull_request.milestone.closed_issues\": null, '\n", + " '\"pull_request.milestone.state\": null, '\n", + " '\"pull_request.milestone.created_at\": null, '\n", + " '\"pull_request.milestone.updated_at\": null, '\n", + " '\"pull_request.milestone.due_on\": null, '\n", + " '\"pull_request.milestone.closed_at\": null, '\n", + " '\"pull_request.merged\": false, '\n", + " '\"pull_request.mergeable\": true, '\n", + " '\"pull_request.mergeable_state\": \"clean\", '\n", + " '\"pull_request.merged_by.login\": null, '\n", + " '\"pull_request.merged_by.id\": null, '\n", + " '\"pull_request.merged_by.type\": null, '\n", + " '\"pull_request.merged_by.site_admin\": null, '\n", + " '\"pull_request.comments\": 0, '\n", + " '\"pull_request.review_comments\": 0, '\n", + " '\"pull_request.commits\": 1, \"pull_request.additions\": '\n", + " '3, \"pull_request.deletions\": 3, '\n", + " '\"pull_request.changed_files\": 1, '\n", + " '\"pull_request.label.id\": null, '\n", + " '\"pull_request.label.name\": null, '\n", + " '\"pull_request.label.color\": null, '\n", + " '\"pull_request.label.default\": null, '\n", + " '\"pull_request.head.label\": \"18F:elk-rename\", '\n", + " '\"pull_request.head.ref\": \"elk-rename\", '\n", + " '\"pull_request.head.sha\": '\n", + " '\"8a8321be4e8eff669e3d3406393b875bf56684c3\", '\n", + " '\"pull_request.head.user.login\": \"18F\", '\n", + " '\"pull_request.head.user.type\": \"Organization\", '\n", + " '\"pull_request.head.repo.name\": \"C2\", '\n", + " '\"pull_request.head.repo.full_name\": \"18F/C2\", '\n", + " '\"pull_request.head.repo.owner.login\": \"18F\", '\n", + " '\"pull_request.head.repo.owner.type\": \"Organization\", '\n", + " '\"pull_request.head.repo.private\": false, '\n", + " '\"pull_request.head.repo.homepage\": '\n", + " '\"https://cap.18f.gov\", '\n", + " '\"pull_request.head.repo.description\": \"an approval '\n", + 
" 'process automation tool\", '\n", + " '\"pull_request.head.repo.fork\": false, '\n", + " '\"pull_request.head.repo.created_at\": '\n", + " '\"2014-03-28T05:15:23Z\", '\n", + " '\"pull_request.head.repo.updated_at\": '\n", + " '\"2015-11-06T02:16:44Z\", '\n", + " '\"pull_request.head.repo.pushed_at\": '\n", + " '\"2015-11-23T19:16:35Z\", '\n", + " '\"pull_request.head.repo.size\": 81432, '\n", + " '\"pull_request.head.repo.stargazers_count\": 31, '\n", + " '\"pull_request.head.repo.watchers_count\": 31, '\n", + " '\"pull_request.head.repo.language\": \"Ruby\", '\n", + " '\"pull_request.head.repo.has_issues\": true, '\n", + " '\"pull_request.head.repo.has_projects\": null, '\n", + " '\"pull_request.head.repo.has_downloads\": true, '\n", + " '\"pull_request.head.repo.has_wiki\": false, '\n", + " '\"pull_request.head.repo.has_pages\": false, '\n", + " '\"pull_request.head.repo.forks_count\": 16, '\n", + " '\"pull_request.head.repo.archived\": null, '\n", + " '\"pull_request.head.repo.disabled\": null, '\n", + " '\"pull_request.head.repo.open_issues_count\": 6, '\n", + " '\"pull_request.head.repo.forks\": 16, '\n", + " '\"pull_request.head.repo.open_issues\": 6, '\n", + " '\"pull_request.head.repo.watchers\": 31, '\n", + " '\"pull_request.head.repo.default_branch\": \"master\", '\n", + " '\"pull_request.head.repo.license.key\": null, '\n", + " '\"pull_request.head.repo.license.spdx_id\": null, '\n", + " '\"pull_request.head.repo.license.name\": null, '\n", + " '\"pull_request.base.label\": \"18F:master\", '\n", + " '\"pull_request.base.ref\": \"master\", '\n", + " '\"pull_request.base.sha\": '\n", + " '\"5dc2669048311777bf472e824c1a6f865eaccc67\", '\n", + " '\"pull_request.base.user.login\": \"18F\", '\n", + " '\"pull_request.base.user.type\": \"Organization\", '\n", + " '\"pull_request.base.repo.name\": \"C2\", '\n", + " '\"pull_request.base.repo.full_name\": \"18F/C2\", '\n", + " '\"pull_request.base.repo.owner.login\": \"18F\", '\n", + " 
'\"pull_request.base.repo.owner.type\": \"Organization\", '\n", + " '\"pull_request.base.repo.private\": false, '\n", + " '\"pull_request.base.repo.homepage\": '\n", + " '\"https://cap.18f.gov\", '\n", + " '\"pull_request.base.repo.description\": \"an approval '\n", + " 'process automation tool\", '\n", + " '\"pull_request.base.repo.fork\": false, '\n", + " '\"pull_request.base.repo.created_at\": '\n", + " '\"2014-03-28T05:15:23Z\", '\n", + " '\"pull_request.base.repo.updated_at\": '\n", + " '\"2015-11-06T02:16:44Z\", '\n", + " '\"pull_request.base.repo.pushed_at\": '\n", + " '\"2015-11-23T19:16:35Z\", '\n", + " '\"pull_request.base.repo.size\": 81432, '\n", + " '\"pull_request.base.repo.stargazers_count\": 31, '\n", + " '\"pull_request.base.repo.watchers_count\": 31, '\n", + " '\"pull_request.base.repo.language\": \"Ruby\", '\n", + " '\"pull_request.base.repo.has_issues\": true, '\n", + " '\"pull_request.base.repo.has_projects\": null, '\n", + " '\"pull_request.base.repo.has_downloads\": true, '\n", + " '\"pull_request.base.repo.has_wiki\": false, '\n", + " '\"pull_request.base.repo.has_pages\": false, '\n", + " '\"pull_request.base.repo.forks_count\": 16, '\n", + " '\"pull_request.base.repo.archived\": null, '\n", + " '\"pull_request.base.repo.disabled\": null, '\n", + " '\"pull_request.base.repo.open_issues_count\": 6, '\n", + " '\"pull_request.base.repo.forks\": 16, '\n", + " '\"pull_request.base.repo.open_issues\": 6, '\n", + " '\"pull_request.base.repo.watchers\": 31, '\n", + " '\"pull_request.base.repo.default_branch\": \"master\", '\n", + " '\"pull_request.base.repo.license.key\": null, '\n", + " '\"pull_request.base.repo.license.spdx_id\": null, '\n", + " '\"pull_request.base.repo.license.name\": null, '\n", + " '\"pull_request.guid\": \"18F/C2/pull/820\"}, {\"type\": '\n", + " '\"PullRequestEvent\", \"action\": \"closed\", '\n", + " '\"actor.login\": \"jessieay\", \"actor.id\": 601515, '\n", + " '\"user.login\": null, \"user.id\": null, \"user.type\": 
'\n", + " 'null, \"repo.name\": \"18F/C2\", \"repo.id\": 18201810, '\n", + " '\"public\": true, \"created_at\": \"2015-11-23T22:09:46Z\", '\n", + " '\"org.id\": 6233994, \"org.login\": \"18F\", '\n", + " '\"pull_request.id\": 51566831, \"pull_request.number\": '\n", + " '820, \"pull_request.state\": \"closed\", '\n", + " '\"pull_request.title\": \"rename elk services to '\n", + " 'workaround blue-green deploy bug\", '\n", + " '\"pull_request.body\": \"there\\'s a bug in the '\n", + " 'cf-blue-green deploy that gets a false positive match '\n", + " 'based on the current ELK naming convention. I have '\n", + " 're-named all our ELK services to workaround that '\n", + " 'bug.\", \"pull_request.user.login\": \"pkarman\", '\n", + " '\"pull_request.user.id\": 1205061, '\n", + " '\"pull_request.author_association\": null, '\n", + " '\"pull_request.created_at\": \"2015-11-23T19:16:34Z\", '\n", + " '\"pull_request.updated_at\": \"2015-11-23T22:09:45Z\", '\n", + " '\"pull_request.closed_at\": \"2015-11-23T22:09:45Z\", '\n", + " '\"pull_request.merged_at\": \"2015-11-23T22:09:45Z\", '\n", + " '\"pull_request.merge_commit_sha\": '\n", + " '\"6d3c30d429a49321552973b81e1ef4cd3073157f\", '\n", + " '\"pull_request.locked\": false, '\n", + " '\"pull_request.assignee.login\": null, '\n", + " '\"pull_request.assignee.id\": null, '\n", + " '\"pull_request.assignee.type\": null, '\n", + " '\"pull_request.assignee.site_admin\": null, '\n", + " '\"pull_request.milestone.id\": null, '\n", + " '\"pull_request.milestone.number\": null, '\n", + " '\"pull_request.milestone.title\": null, '\n", + " '\"pull_request.milestone.description\": null, '\n", + " '\"pull_request.milestone.creator.login\": null, '\n", + " '\"pull_request.milestone.creator.id\": null, '\n", + " '\"pull_request.milestone.creator.type\": null, '\n", + " '\"pull_request.milestone.creator.site_admin\": null, '\n", + " '\"pull_request.milestone.open_issues\": null, '\n", + " '\"pull_request.milestone.closed_issues\": null, 
'\n", + " '\"pull_request.milestone.state\": null, '\n", + " '\"pull_request.milestone.created_at\": null, '\n", + " '\"pull_request.milestone.updated_at\": null, '\n", + " '\"pull_request.milestone.due_on\": null, '\n", + " '\"pull_request.milestone.closed_at\": null, '\n", + " '\"pull_request.merged\": true, '\n", + " '\"pull_request.mergeable\": null, '\n", + " '\"pull_request.mergeable_state\": \"unknown\", '\n", + " '\"pull_request.merged_by.login\": \"jessieay\", '\n", + " '\"pull_request.merged_by.id\": 601515, '\n", + " '\"pull_request.merged_by.type\": \"User\", '\n", + " '\"pull_request.merged_by.site_admin\": false, '\n", + " '\"pull_request.comments\": 1, '\n", + " '\"pull_request.review_comments\": 0, '\n", + " '\"pull_request.commits\": 1, \"pull_request.additions\": '\n", + " '3, \"pull_request.deletions\": 3, '\n", + " '\"pull_request.changed_files\": 1, '\n", + " '\"pull_request.label.id\": null, '\n", + " '\"pull_request.label.name\": null, '\n", + " '\"pull_request.label.color\": null, '\n", + " '\"pull_request.label.default\": null, '\n", + " '\"pull_request.head.label\": \"18F:elk-rename\", '\n", + " '\"pull_request.head.ref\": \"elk-rename\", '\n", + " '\"pull_request.head.sha\": '\n", + " '\"8a8321be4e8eff669e3d3406393b875bf56684c3\", '\n", + " '\"pull_request.head.user.login\": \"18F\", '\n", + " '\"pull_request.head.user.type\": \"Organization\", '\n", + " '\"pull_request.head.repo.name\": \"C2\", '\n", + " '\"pull_request.head.repo.full_name\": \"18F/C2\", '\n", + " '\"pull_request.head.repo.owner.login\": \"18F\", '\n", + " '\"pull_request.head.repo.owner.type\": \"Organization\", '\n", + " '\"pull_request.head.repo.private\": false, '\n", + " '\"pull_request.head.repo.homepage\": '\n", + " '\"https://cap.18f.gov\", '\n", + " '\"pull_request.head.repo.description\": \"an approval '\n", + " 'process automation tool\", '\n", + " '\"pull_request.head.repo.fork\": false, '\n", + " '\"pull_request.head.repo.created_at\": '\n", + " 
'\"2014-03-28T05:15:23Z\", '\n", + " '\"pull_request.head.repo.updated_at\": '\n", + " '\"2015-11-06T02:16:44Z\", '\n", + " '\"pull_request.head.repo.pushed_at\": '\n", + " '\"2015-11-23T22:09:45Z\", '\n", + " '\"pull_request.head.repo.size\": 81440, '\n", + " '\"pull_request.head.repo.stargazers_count\": 31, '\n", + " '\"pull_request.head.repo.watchers_count\": 31, '\n", + " '\"pull_request.head.repo.language\": \"Ruby\", '\n", + " '\"pull_request.head.repo.has_issues\": true, '\n", + " '\"pull_request.head.repo.has_projects\": null, '\n", + " '\"pull_request.head.repo.has_downloads\": true, '\n", + " '\"pull_request.head.repo.has_wiki\": false, '\n", + " '\"pull_request.head.repo.has_pages\": false, '\n", + " '\"pull_request.head.repo.forks_count\": 16, '\n", + " '\"pull_request.head.repo.archived\": null, '\n", + " '\"pull_request.head.repo.disabled\": null, '\n", + " '\"pull_request.head.repo.open_issues_count\": 4, '\n", + " '\"pull_request.head.repo.forks\": 16, '\n", + " '\"pull_request.head.repo.open_issues\": 4, '\n", + " '\"pull_request.head.repo.watchers\": 31, '\n", + " '\"pull_request.head.repo.default_branch\": \"master\", '\n", + " '\"pull_request.head.repo.license.key\": null, '\n", + " '\"pull_request.head.repo.license.spdx_id\": null, '\n", + " '\"pull_request.head.repo.license.name\": null, '\n", + " '\"pull_request.base.label\": \"18F:master\", '\n", + " '\"pull_request.base.ref\": \"master\", '\n", + " '\"pull_request.base.sha\": '\n", + " '\"5dc2669048311777bf472e824c1a6f865eaccc67\", '\n", + " '\"pull_request.base.user.login\": \"18F\", '\n", + " '\"pull_request.base.user.type\": \"Organization\", '\n", + " '\"pull_request.base.repo.name\": \"C2\", '\n", + " '\"pull_request.base.repo.full_name\": \"18F/C2\", '\n", + " '\"pull_request.base.repo.owner.login\": \"18F\", '\n", + " '\"pull_request.base.repo.owner.type\": \"Organization\", '\n", + " '\"pull_request.base.repo.private\": false, '\n", + " '\"pull_request.base.repo.homepage\": '\n", + 
" '\"https://cap.18f.gov\", '\n", + " '\"pull_request.base.repo.description\": \"an approval '\n", + " 'process automation tool\", '\n", + " '\"pull_request.base.repo.fork\": false, '\n", + " '\"pull_request.base.repo.created_at\": '\n", + " '\"2014-03-28T05:15:23Z\", '\n", + " '\"pull_request.base.repo.updated_at\": '\n", + " '\"2015-11-06T02:16:44Z\", '\n", + " '\"pull_request.base.repo.pushed_at\": '\n", + " '\"2015-11-23T22:09:45Z\", '\n", + " '\"pull_request.base.repo.size\": 81440, '\n", + " '\"pull_request.base.repo.stargazers_count\": 31, '\n", + " '\"pull_request.base.repo.watchers_count\": 31, '\n", + " '\"pull_request.base.repo.language\": \"Ruby\", '\n", + " '\"pull_request.base.repo.has_issues\": true, '\n", + " '\"pull_request.base.repo.has_projects\": null, '\n", + " '\"pull_request.base.repo.has_downloads\": true, '\n", + " '\"pull_request.base.repo.has_wiki\": false, '\n", + " '\"pull_request.base.repo.has_pages\": false, '\n", + " '\"pull_request.base.repo.forks_count\": 16, '\n", + " '\"pull_request.base.repo.archived\": null, '\n", + " '\"pull_request.base.repo.disabled\": null, '\n", + " '\"pull_request.base.repo.open_issues_count\": 4, '\n", + " '\"pull_request.base.repo.forks\": 16, '\n", + " '\"pull_request.base.repo.open_issues\": 4, '\n", + " '\"pull_request.base.repo.watchers\": 31, '\n", + " '\"pull_request.base.repo.default_branch\": \"master\", '\n", + " '\"pull_request.base.repo.license.key\": null, '\n", + " '\"pull_request.base.repo.license.spdx_id\": null, '\n", + " '\"pull_request.base.repo.license.name\": null, '\n", + " '\"pull_request.guid\": \"18F/C2/pull/820\"}]',\n", + " 'pull_request.guid': '18F/C2/pull/820',\n", + " 'pull_request.issue_events': '{\"repo\": \"18F/C2\", \"org\": \"18F\", \"issue_id\": '\n", + " '118451607, \"issue_number\": 820, \"pull_request\": '\n", + " '{\"number\": 820.0, \"repo\": \"C2\", \"user_login\": '\n", + " '\"18F\"}, \"events\": [{\"action\": \"opened\", '\n", + " '\"author\": \"pkarman\", 
\"comment\": null, '\n", + " '\"comment_id\": null, \"datetime\": '\n", + " '\"2015-11-23T19:16:34Z\", \"description\": '\n", + " '\"there\\'s a bug in the cf-blue-green deploy '\n", + " 'that gets a false positive match based on the '\n", + " 'current ELK naming convention. I have re-named '\n", + " 'all our ELK services to workaround that bug.\", '\n", + " '\"title\": \"rename elk services to workaround '\n", + " 'blue-green deploy bug\", \"type\": \"issue\"}, '\n", + " '{\"action\": \"created\", \"author\": \"jessieay\", '\n", + " '\"comment\": \"wish there were a good way to write '\n", + " 'tests for this type of thing...\\\\r\\\\n\\\\r\\\\nbut '\n", + " 'LGTM. merging. \", \"comment_id\": 159082113.0, '\n", + " '\"datetime\": \"2015-11-23 22:09:43+00:00\", '\n", + " '\"description\": null, \"title\": null, \"type\": '\n", + " '\"comment\"}]}'}\n" + ] + } + ], + "source": [ + "pprint(small_ds[8])" + ] + }, + { + "cell_type": "code", + "execution_count": 327, + "metadata": {}, + "outputs": [], + "source": [ + "actions = []\n", + "c = 0\n", + "for events in ds[\"events\"]:\n", + " c += 1\n", + " actions.extend([event[\"action\"] for event in events])\n", + " if c > 10000:\n", + " break\n" + ] + }, + { + "cell_type": "code", + "execution_count": 328, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'closed', 'created', 'opened', 'reopened'}" + ] + }, + "execution_count": 328, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "set(actions)" + ] + }, + { + "cell_type": "code", + "execution_count": 322, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'base_repo_info': {'pull_request.base.label': '1011X:master',\n", + " 'pull_request.base.ref': 'master',\n", + " 'pull_request.base.repo.default_branch': 'master',\n", + " 'pull_request.base.repo.description': 'Representing '\n", + " 'rational numbers '\n", + " 'using the '\n", + " 'floating-bar number '\n", + " 
'type.',\n", + " 'pull_request.base.repo.forks_count': 2,\n", + " 'pull_request.base.repo.homepage': None,\n", + " 'pull_request.base.repo.language': 'Rust',\n", + " 'pull_request.base.repo.license.name': 'Other',\n", + " 'pull_request.base.repo.name': 'floating_bar',\n", + " 'pull_request.base.repo.open_issues_count': 6,\n", + " 'pull_request.base.repo.owner.login': '1011X',\n", + " 'pull_request.base.repo.owner.type': 'User',\n", + " 'pull_request.base.repo.private': False,\n", + " 'pull_request.base.repo.stargazers_count': 15,\n", + " 'pull_request.base.repo.watchers_count': 15,\n", + " 'pull_request.base.sha': '27ee250ef208e11aa36dc77022b0f8a58e965dba',\n", + " 'pull_request.base.user.login': '1011X',\n", + " 'pull_request.base.user.type': 'User',\n", + " 'pull_request.comments': 0,\n", + " 'pull_request.label.name': None,\n", + " 'pull_request.review_comments': 0},\n", + " 'bucket': '940',\n", + " 'events': [{'action': 'opened',\n", + " 'actor.id': None,\n", + " 'actor.login': None,\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': datetime.datetime(2021, 5, 8, 20, 30, 31, tzinfo=),\n", + " 'issue.author': 'ZoeyR',\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': None,\n", + " 'pull_request.merged': None,\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.state': None,\n", + " 
'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'issue',\n", + " 'user.login': None,\n", + " 'user.type': None},\n", + " {'action': 'opened',\n", + " 'actor.id': 8010244,\n", + " 'actor.login': 'ZoeyR',\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': datetime.datetime(2021, 5, 8, 20, 30, 32, tzinfo=),\n", + " 'issue.author': None,\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': None,\n", + " 'pull_request.merged': False,\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.state': 'open',\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'PullRequestEvent',\n", + " 'user.login': None,\n", + " 'user.type': None},\n", + " {'action': 'created',\n", + " 'actor.id': None,\n", + " 'actor.login': None,\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': 
None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': datetime.datetime(2021, 5, 8, 20, 38, 27, tzinfo=),\n", + " 'issue.author': '1011X',\n", + " 'issue.comment': 'LGTM, thank you!',\n", + " 'issue.comment_id': 835503633.0,\n", + " 'pull_request.merged': None,\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.state': None,\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'comment',\n", + " 'user.login': None,\n", + " 'user.type': None},\n", + " {'action': 'closed',\n", + " 'actor.id': 1851619,\n", + " 'actor.login': '1011X',\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': datetime.datetime(2021, 5, 8, 20, 38, 38, tzinfo=),\n", + " 'issue.author': None,\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': None,\n", + " 'pull_request.merged': True,\n", + " 
'pull_request.merged_by.login': '1011X',\n", + " 'pull_request.merged_by.type': 'User',\n", + " 'pull_request.state': 'closed',\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'PullRequestEvent',\n", + " 'user.login': None,\n", + " 'user.type': None}],\n", + " 'head_repo_info': {'pull_request.head.label': 'ZoeyR:fractional-benches',\n", + " 'pull_request.head.ref': 'fractional-benches',\n", + " 'pull_request.head.repo.default_branch': 'master',\n", + " 'pull_request.head.repo.description': 'Representing '\n", + " 'rational numbers '\n", + " 'using the '\n", + " 'floating-bar number '\n", + " 'type.',\n", + " 'pull_request.head.repo.homepage': None,\n", + " 'pull_request.head.repo.language': None,\n", + " 'pull_request.head.repo.license.name': 'Other',\n", + " 'pull_request.head.repo.name': 'floating_bar',\n", + " 'pull_request.head.repo.owner.login': 'ZoeyR',\n", + " 'pull_request.head.repo.owner.type': 'User',\n", + " 'pull_request.head.repo.private': False,\n", + " 'pull_request.head.repo.stargazers_count': 0,\n", + " 'pull_request.head.sha': '742df616b7ea2cb927d5247ec69b91e6c6d8cbdd',\n", + " 'pull_request.head.user.login': 'ZoeyR',\n", + " 'pull_request.head.user.type': 'User'},\n", + " 'pull_request_info': {'org.id': None,\n", + " 'public': True,\n", + " 'pull_request.additions': 23,\n", + " 'pull_request.base.user.type': 'User',\n", + " 'pull_request.body': '',\n", + " 'pull_request.changed_files': 4,\n", + " 'pull_request.closed_at': None,\n", + " 'pull_request.comments': 0,\n", + " 'pull_request.commits': 1,\n", + " 'pull_request.created_at': '2021-05-08T20:30:31Z',\n", + " 'pull_request.deletions': 19,\n", + " 'pull_request.guid': '1011X/floating_bar/pull/7',\n", + " 'pull_request.head.user.type': 'User',\n", + " 'pull_request.id': 634875503,\n", + " 'pull_request.merged_at': None,\n", + " 
'pull_request.merged_by.login': None,\n", + " 'pull_request.milestone.description': None,\n", + " 'pull_request.milestone.number': None,\n", + " 'pull_request.milestone.title': None,\n", + " 'pull_request.number': 7,\n", + " 'pull_request.review_comments': 0,\n", + " 'pull_request.state': 'open',\n", + " 'pull_request.title': 'change benches to use fractional '\n", + " 'values',\n", + " 'pull_request.user.id': 8010244,\n", + " 'pull_request.user.login': 'ZoeyR',\n", + " 'repo.id': 166723951,\n", + " 'repo.name': '1011X/floating_bar'}}\n" + ] + } + ], + "source": [ + "pprint(ds[6])" + ] + }, + { + "cell_type": "code", + "execution_count": 318, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'bucket': '940',\n", + " 'pull_request_info': {'org.id': None,\n", + " 'public': True,\n", + " 'pull_request.additions': 23,\n", + " 'pull_request.base.user.type': 'User',\n", + " 'pull_request.body': '',\n", + " 'pull_request.changed_files': 4,\n", + " 'pull_request.closed_at': None,\n", + " 'pull_request.comments': 0,\n", + " 'pull_request.commits': 1,\n", + " 'pull_request.created_at': '2021-05-08T20:30:31Z',\n", + " 'pull_request.deletions': 19,\n", + " 'pull_request.guid': '1011X/floating_bar/pull/7',\n", + " 'pull_request.head.user.type': 'User',\n", + " 'pull_request.id': 634875503,\n", + " 'pull_request.merged_at': None,\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.milestone.description': None,\n", + " 'pull_request.milestone.number': None,\n", + " 'pull_request.milestone.title': None,\n", + " 'pull_request.number': 7,\n", + " 'pull_request.review_comments': 0,\n", + " 'pull_request.state': 'open',\n", + " 'pull_request.title': 'change benches to use fractional values',\n", + " 'pull_request.user.id': 8010244,\n", + " 'pull_request.user.login': 'ZoeyR',\n", + " 'repo.id': 166723951,\n", + " 'repo.name': '1011X/floating_bar'},\n", + " 'head_repo_info': {'pull_request.head.label': 'ZoeyR:fractional-benches',\n", + " 
'pull_request.head.ref': 'fractional-benches',\n", + " 'pull_request.head.repo.default_branch': 'master',\n", + " 'pull_request.head.repo.description': 'Representing rational numbers using the floating-bar number type.',\n", + " 'pull_request.head.repo.homepage': None,\n", + " 'pull_request.head.repo.language': None,\n", + " 'pull_request.head.repo.license.name': 'Other',\n", + " 'pull_request.head.repo.name': 'floating_bar',\n", + " 'pull_request.head.repo.owner.login': 'ZoeyR',\n", + " 'pull_request.head.repo.owner.type': 'User',\n", + " 'pull_request.head.repo.private': False,\n", + " 'pull_request.head.repo.stargazers_count': 0,\n", + " 'pull_request.head.sha': '742df616b7ea2cb927d5247ec69b91e6c6d8cbdd',\n", + " 'pull_request.head.user.login': 'ZoeyR',\n", + " 'pull_request.head.user.type': 'User'},\n", + " 'base_repo_info': {'pull_request.base.label': '1011X:master',\n", + " 'pull_request.base.ref': 'master',\n", + " 'pull_request.base.repo.default_branch': 'master',\n", + " 'pull_request.base.repo.description': 'Representing rational numbers using the floating-bar number type.',\n", + " 'pull_request.base.repo.forks_count': 2,\n", + " 'pull_request.base.repo.homepage': None,\n", + " 'pull_request.base.repo.language': 'Rust',\n", + " 'pull_request.base.repo.license.name': 'Other',\n", + " 'pull_request.base.repo.name': 'floating_bar',\n", + " 'pull_request.base.repo.open_issues_count': 6,\n", + " 'pull_request.base.repo.owner.login': '1011X',\n", + " 'pull_request.base.repo.owner.type': 'User',\n", + " 'pull_request.base.repo.private': False,\n", + " 'pull_request.base.repo.stargazers_count': 15,\n", + " 'pull_request.base.repo.watchers_count': 15,\n", + " 'pull_request.base.sha': '27ee250ef208e11aa36dc77022b0f8a58e965dba',\n", + " 'pull_request.base.user.login': '1011X',\n", + " 'pull_request.base.user.type': 'User',\n", + " 'pull_request.comments': 0,\n", + " 'pull_request.label.name': None,\n", + " 'pull_request.review_comments': 0},\n", + " 'events': 
[{'action': 'opened',\n", + " 'actor.id': None,\n", + " 'actor.login': None,\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': datetime.datetime(2021, 5, 8, 20, 30, 31, tzinfo=),\n", + " 'issue.author': 'ZoeyR',\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': None,\n", + " 'pull_request.merged': None,\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.state': None,\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'issue',\n", + " 'user.login': None,\n", + " 'user.type': None},\n", + " {'action': 'opened',\n", + " 'actor.id': 8010244,\n", + " 'actor.login': 'ZoeyR',\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 
'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': datetime.datetime(2021, 5, 8, 20, 30, 32, tzinfo=),\n", + " 'issue.author': None,\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': None,\n", + " 'pull_request.merged': False,\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.state': 'open',\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'PullRequestEvent',\n", + " 'user.login': None,\n", + " 'user.type': None},\n", + " {'action': 'created',\n", + " 'actor.id': None,\n", + " 'actor.login': None,\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': datetime.datetime(2021, 5, 8, 20, 38, 27, tzinfo=),\n", + " 'issue.author': '1011X',\n", + " 'issue.comment': 'LGTM, thank you!',\n", + " 'issue.comment_id': 835503633.0,\n", + " 'pull_request.merged': None,\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.state': None,\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 
'review.submitted_at': None,\n", + " 'type': 'comment',\n", + " 'user.login': None,\n", + " 'user.type': None},\n", + " {'action': 'closed',\n", + " 'actor.id': 1851619,\n", + " 'actor.login': '1011X',\n", + " 'comment.author_association': None,\n", + " 'comment.body': None,\n", + " 'comment.commit_id': None,\n", + " 'comment.created_at': None,\n", + " 'comment.diff_hunk': None,\n", + " 'comment.id': None,\n", + " 'comment.in_reply_to_id': None,\n", + " 'comment.line': None,\n", + " 'comment.original_commit_id': None,\n", + " 'comment.original_line': None,\n", + " 'comment.original_position': None,\n", + " 'comment.original_start_line': None,\n", + " 'comment.path': None,\n", + " 'comment.position': None,\n", + " 'comment.side': None,\n", + " 'comment.start_line': None,\n", + " 'comment.start_side': None,\n", + " 'comment.updated_at': None,\n", + " 'created_at': datetime.datetime(2021, 5, 8, 20, 38, 38, tzinfo=),\n", + " 'issue.author': None,\n", + " 'issue.comment': None,\n", + " 'issue.comment_id': None,\n", + " 'pull_request.merged': True,\n", + " 'pull_request.merged_by.login': '1011X',\n", + " 'pull_request.merged_by.type': 'User',\n", + " 'pull_request.state': 'closed',\n", + " 'review.author_association': None,\n", + " 'review.body': None,\n", + " 'review.commit_id': None,\n", + " 'review.id': None,\n", + " 'review.state': None,\n", + " 'review.submitted_at': None,\n", + " 'type': 'PullRequestEvent',\n", + " 'user.login': None,\n", + " 'user.type': None}]}" + ] + }, + "execution_count": 318, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sample = ds[6]\n", + "sample" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [], + "source": [ + "sample = ds[0]\n", + "pr_info = sample[\"pull_request_info\"]\n", + "head_info = sample[\"head_repo_info\"]\n", + "base_info = sample[\"base_repo_info\"]\n", + "events = sample[\"events\"]\n", + "\n", + "gh_link = 
f\"https://github.com/{pr_info['repo.name']}/pull/{pr_info['pull_request.number']}\"\n", + "\n", + "header = f\"\"\"📝 **Title**: {pr_info['pull_request.title']}
\n", + "📦 **GitHub Repo**: {pr_info['repo.name']}, PR Number: {pr_info['pull_request.number']}, ID: {pr_info['pull_request.id']}.
\n", + "Link: [{gh_link}]({gh_link})\"\"\"\n", + "pr_info_html = f\"\"\"\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AttributeDetail
🧾 PR Type{events[0]['type']}
🟢 PR State{pr_info['pull_request.state']}
👤 PR Author{pr_info['pull_request.user.login']}
🏷️ Head Branchref: {head_info['pull_request.head.ref']}, label: {head_info['pull_request.head.label']}
🌳 Base Branch{base_info['pull_request.base.ref']}
\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AttributeDetail
🧾 PR TypePullRequestEvent
🟢 PR Stateopen
👤 PR Authordependabot[bot]
🏷️ Head Branchref: dependabot/npm_and_yarn/qs-6.5.3, label: AbdElrahmanMuhammedNasr:dependabot/npm_and_yarn/qs-6.5.3
🌳 Base Branchmaster
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# display pr_info_html as HTML\n", + "from IPython.display import HTML, display\n", + "display(HTML(pr_info_html))" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
actioncommentscreated_attype
0opened{'actor.id': 49699333, 'actor.login': 'dependa...2022-12-10 03:27:08+00:00PullRequestEvent
\n", + "
" + ], + "text/plain": [ + " action comments \n", + "0 opened {'actor.id': 49699333, 'actor.login': 'dependa... \\\n", + "\n", + " created_at type \n", + "0 2022-12-10 03:27:08+00:00 PullRequestEvent " + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame(events)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Event TypePullRequestEvent
UserNone (type :None)
Review StateNone
From HeadAbdElrahmanMuhammedNasr:dependabot/npm_and_yarn/qs-6.5.3
\n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def create_grouped_events(events):\n", + " df = pd.DataFrame(events)\n", + " df['created_at'] = pd.to_datetime(df['created_at'])\n", + " df = df.sort_values(['comment.diff_hunk', 'comment.commit_id', 'created_at'])\n", + " # Group events in a the same thread using 'comment.diff_hunk' and 'comment.commit_id'\n", + " if len(df) == 1:\n", + " grouped_events = [[df.iloc[0].to_dict()]]\n", + " else:\n", + " grouped_events = [group.to_dict(orient='records') for _, group in df.groupby(['comment.diff_hunk', 'comment.commit_id'])]\n", + " return grouped_events\n", + " \n", + "events = sample[\"events\"]\n", + "grouped_events = create_grouped_events(events)\n", + "original_poster = sample[\"pull_request_info\"]['pull_request.user.login']\n", + "for thread in grouped_events:\n", + " # Start a new thread\n", + " thread_html = '
'\n", + " # Get the first event in the thread as a reference\n", + " first_event = thread[0]\n", + " \n", + " # Add shared parts of the events only once\n", + " text = f\"\"\"\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Event Type{first_event['type']}
User{first_event['user.login']} (type :{first_event['user.type']})
Review State{first_event['review.state']}
From Head{head_info['pull_request.head.label']}
\n", + "
\n", + " \"\"\"\n", + " thread_html += text\n", + " \n", + " # Add the bodies of the comments for each event in the thread\n", + " for event in thread:\n", + " if event['comment.body']:\n", + " is_op = original_poster == event['user.login']\n", + " thread_html += format_body(event['comment.body'], event['user.login'], is_op)\n", + " thread_html += '
'\n", + " display(HTML(thread_html))\n", + " if first_event['comment.path']:\n", + " path_html = f\"Path: {first_event['comment.path']}\"\n", + " display(HTML(path_html))\n", + " if first_event[\"comment.diff_hunk\"]:\n", + " print(first_event[\"comment.diff_hunk\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'dependabot[bot]'" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sample[\"pull_request_info\"]['pull_request.user.login']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "import ghdiff\n", + "import streamlit as st\n", + "import streamlit.components.v1 as components\n", + "from datasets import load_dataset\n", + "\n", + "\n", + "# save dataset as in \"bigcode/code_reviews_sample\"\n", + "ds = load_dataset(\"loubnabnl/clean_prs2\", split=\"train\")\n", + "size = len(ds)\n", + "\n", + "def show_diff_hunk(diff_hunk, position, context=5):\n", + " # exclude the first line with the @@ notation\n", + " lines = diff_hunk.split('\\n')\n", + " start_line = max(int(position) - context - 1, 0)\n", + " end_line = int(position)\n", + " actual_diff = lines[0] + '\\n' + '\\n'.join(lines[start_line + 1:end_line + 1])\n", + " focus = ghdiff.colorize(actual_diff)\n", + " full = ghdiff.colorize(diff_hunk)\n", + " # Wrap the diff hunk inside a scrollable div\n", + " scrollable_focus = f'
{focus}
'\n", + " scrollable_full = f'
{full}
'\n", + " if len(lines) <= 12:\n", + " return None, scrollable_full\n", + " return scrollable_focus, scrollable_full\n", + "\n", + "\n", + "def format_body(text, user, is_op=False):\n", + " color = \"#007bff\" if is_op else \"black\"\n", + " pr_body = f\"
👤{user}: {text}
\"\n", + " return pr_body\n", + "\n", + "\n", + "def create_grouped_events(events):\n", + " df = pd.DataFrame(events)\n", + " df['created_at'] = pd.to_datetime(df['created_at'])\n", + " df = df.sort_values(['comment.diff_hunk', 'comment.commit_id', 'created_at'])\n", + " # Group events in a the same thread using 'comment.diff_hunk' and 'comment.commit_id'\n", + " if len(df) == 1:\n", + " grouped_events = [[df.iloc[0].to_dict()]]\n", + " else:\n", + " grouped_events = [group.to_dict(orient='records') for _, group in df.groupby(['comment.diff_hunk', 'comment.commit_id'])]\n", + " return grouped_events\n", + "\n", + "\n", + "def get_pr_info(sample):\n", + " pr_info = sample[\"pull_request_info\"]\n", + " head_info = sample[\"head_repo_info\"]\n", + " base_info = sample[\"base_repo_info\"]\n", + " events = sample[\"events\"]\n", + "\n", + " gh_link = f\"https://github.com/{pr_info['repo.name']}/pull/{pr_info['pull_request.number']}\"\n", + " \n", + " header = f\"\"\"📝 **Title**: {pr_info['pull_request.title']}
\n", + " 📦 **GitHub Repo**: {pr_info['repo.name']}, PR Number: {pr_info['pull_request.number']}, ID: {pr_info['pull_request.id']}.
\n", + " Link: [{gh_link}]({gh_link})\"\"\"\n", + " pr_info_html = f\"\"\"\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AttributeDetail
🧾 PR Type{events[0]['type']}
🟢 PR State{pr_info['pull_request.state']}
👤 PR Author{pr_info['pull_request.user.login']}
🏷️ Head Branchref: {head_info['pull_request.head.ref']}, label: {head_info['pull_request.head.label']}
🌳 Base Branch{base_info['pull_request.base.ref']}
\n", + " \"\"\"\n", + " return header, pr_info_html\n", + "\n", + "\n", + "def display_events(sample):\n", + " events = sample[\"events\"]\n", + " grouped_events = create_grouped_events(events)\n", + " original_poster = sample[\"pull_request_info\"]['pull_request.user.login']\n", + " for thread in grouped_events:\n", + " # Start a new thread\n", + " thread_html = '
'\n", + " # Get the first event in the thread as a reference\n", + " first_event = thread[0]\n", + " \n", + " # Add shared parts of the events only once\n", + " text = f\"\"\"\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Event Type{first_event['type']}
User{first_event['user.login']} (type :{first_event['user.type']})
Review State{first_event['review.state']}
From Head{first_event['pull_request.head.label']}
\n", + "
\n", + " \"\"\"\n", + " thread_html += text\n", + " \n", + " # Add the bodies of the comments for each event in the thread\n", + " for event in thread:\n", + " if event['comment.body']:\n", + " is_op = original_poster == event['user.login']\n", + " thread_html += format_body(event['comment.body'], event['user.login'], is_op)\n", + " thread_html += '
'\n", + " st.markdown(thread_html, unsafe_allow_html=True)\n", + " if first_event['comment.path']:\n", + " path_html = f\"Path: {first_event['comment.path']}\"\n", + " st.markdown(path_html, unsafe_allow_html=True)\n", + " if first_event[\"comment.diff_hunk\"]:\n", + " focus_diff, full_diff = show_diff_hunk(first_event[\"comment.diff_hunk\"], first_event[\"comment.original_position\"])\n", + " if not focus_diff:\n", + " components.html(full_diff)\n", + " else:\n", + " components.html(focus_diff)\n", + " with st.expander(\"View Full diff hunk\"):\n", + " components.html(full_diff)\n", + " st.markdown(\"---\")\n", + "\n", + "def custom_css():\n", + " st.markdown(\"\"\"\n", + " \n", + " \"\"\", unsafe_allow_html=True)\n", + "\n", + "custom_css()\n", + "\n", + "\n", + "#st.set_page_config(page_icon=\":laptop:\", layout=\"wide\")\n", + "st.markdown(f\"\"\"\\\n", + " # GitHub Code Reviews Inspection 🔍\n", + " In this space you can inspect code reviews from GitHUb Pull Requests. Note that some may have empty text (e.g approval of a PR without a code comment).\n", + " You can find the dataset at [bigcode/code_reviews_sample](https://huggingface.co/datasets/bigcode/code_reviews_sample)\n", + " \"\"\"\n", + " )\n", + "example_index = st.number_input(f\"Example (0 to {size-1}):\", min_value=0, max_value=size-1, value=0, step=1)\n", + "\n", + "header, pr_info_html = get_pr_info(ds[example_index])\n", + "st.subheader(\"PR information\")\n", + "st.markdown(header, unsafe_allow_html=True)\n", + "st.markdown(pr_info_html, unsafe_allow_html=True)\n", + "st.markdown(\"
\", unsafe_allow_html=True)\n", + "st.subheader(\"Code review events\")\n", + "event_blocks = display_events(ds[example_index])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ValueError: The features can't be aligned because the key pull_request_info of features {'pull_request.guid': Value(dtype='string', id=None), 'pull_request.code_review_events': Value(dtype='string', id=None), 'pull_request.events': Value(dtype='string', id=None), 'pull_request.issue_events': Value(dtype='string', id=None), 'bucket': Value(dtype='string', id=None), '__index_level_0__': Value(dtype='int64', id=None), 'pull_request_info': {'org.id': Value(dtype='int64', id=None), 'public': Value(dtype='bool', id=None), 'pull_request.additions': Value(dtype='int64', id=None), 'pull_request.body': Value(dtype='string', id=None), 'pull_request.changed_files': Value(dtype='int64', id=None), 'pull_request.closed_at': Value(dtype='null', id=None), 'pull_request.comments': Value(dtype='int64', id=None), 'pull_request.commits': Value(dtype='int64', id=None), 'pull_request.created_at': Value(dtype='string', id=None), 'pull_request.deletions': Value(dtype='int64', id=None), 'pull_request.guid': Value(dtype='string', id=None), 'pull_request.id': Value(dtype='int64', id=None), 'pull_request.merged_at': Value(dtype='null', id=None), 'pull_request.merged_by.login': Value(dtype='null', id=None), 'pull_request.milestone.description': Value(dtype='null', id=None), 'pull_request.milestone.number': Value(dtype='null', id=None), 'pull_request.milestone.title': Value(dtype='null', id=None), 'pull_request.number': Value(dtype='int64', id=None), 'pull_request.review_comments': Value(dtype='int64', id=None), 'pull_request.state': Value(dtype='string', id=None), 'pull_request.title': Value(dtype='string', id=None), 'pull_request.user.id': Value(dtype='int64', id=None), 'pull_request.user.login': Value(dtype='string', id=None), 'repo.id': 
Value(dtype='int64', id=None), 'repo.name': Value(dtype='string', id=None)}, 'head_repo_info': {'pull_request.head.label': Value(dtype='string', id=None), 'pull_request.head.ref': Value(dtype='string', id=None), 'pull_request.head.repo.default_branch': Value(dtype='string', id=None), 'pull_request.head.repo.description': Value(dtype='null', id=None), 'pull_request.head.repo.homepage': Value(dtype='null', id=None), 'pull_request.head.repo.language': Value(dtype='string', id=None), 'pull_request.head.repo.license.name': Value(dtype='null', id=None), 'pull_request.head.repo.name': Value(dtype='string', id=None), 'pull_request.head.repo.owner.login': Value(dtype='string', id=None), 'pull_request.head.repo.owner.type': Value(dtype='string', id=None), 'pull_request.head.repo.private': Value(dtype='bool', id=None), 'pull_request.head.repo.stargazers_count': Value(dtype='int64', id=None), 'pull_request.head.sha': Value(dtype='string', id=None), 'pull_request.head.user.login': Value(dtype='string', id=None), 'pull_request.head.user.type': Value(dtype='string', id=None)}, 'base_repo_info': {'pull_request.base.label': Value(dtype='string', id=None), 'pull_request.base.ref': Value(dtype='string', id=None), 'pull_request.base.repo.default_branch': Value(dtype='string', id=None), 'pull_request.base.repo.description': Value(dtype='null', id=None), 'pull_request.base.repo.forks_count': Value(dtype='int64', id=None), 'pull_request.base.repo.homepage': Value(dtype='null', id=None), 'pull_request.base.repo.language': Value(dtype='string', id=None), 'pull_request.base.repo.license.name': Value(dtype='null', id=None), 'pull_request.base.repo.name': Value(dtype='string', id=None), 'pull_request.base.repo.open_issues_count': Value(dtype='int64', id=None), 'pull_request.base.repo.owner.login': Value(dtype='string', id=None), 'pull_request.base.repo.owner.type': Value(dtype='string', id=None), 'pull_request.base.repo.private': Value(dtype='bool', id=None), 
'pull_request.base.repo.stargazers_count': Value(dtype='int64', id=None), 'pull_request.base.repo.watchers_count': Value(dtype='int64', id=None), 'pull_request.base.sha': Value(dtype='string', id=None), 'pull_request.base.user.login': Value(dtype='string', id=None), 'pull_request.base.user.type': Value(dtype='string', id=None), 'pull_request.comments': Value(dtype='int64', id=None), 'pull_request.label.name': Value(dtype='null', id=None), 'pull_request.review_comments': Value(dtype='int64', id=None)}, 'events': [{'action': Value(dtype='string', id=None), 'created_at': Value(dtype='timestamp[us, tz=UTC]', id=None), 'issues_comments': {'action': Value(dtype='string', id=None), 'author': Value(dtype='null', id=None), 'comment': Value(dtype='null', id=None), 'comment_id': Value(dtype='null', id=None), 'datetime': Value(dtype='null', id=None), 'type': Value(dtype='string', id=None)}, 'review_comments': {'actor.id': Value(dtype='int64', id=None), 'actor.login': Value(dtype='string', id=None), 'comment.author_association': Value(dtype='null', id=None), 'comment.body': Value(dtype='null', id=None), 'comment.commit_id': Value(dtype='null', id=None), 'comment.created_at': Value(dtype='null', id=None), 'comment.diff_hunk': Value(dtype='null', id=None), 'comment.id': Value(dtype='null', id=None), 'comment.in_reply_to_id': Value(dtype='null', id=None), 'comment.line': Value(dtype='null', id=None), 'comment.original_commit_id': Value(dtype='null', id=None), 'comment.original_line': Value(dtype='null', id=None), 'comment.original_position': Value(dtype='null', id=None), 'comment.original_start_line': Value(dtype='null', id=None), 'comment.path': Value(dtype='null', id=None), 'comment.position': Value(dtype='null', id=None), 'comment.side': Value(dtype='null', id=None), 'comment.start_line': Value(dtype='null', id=None), 'comment.start_side': Value(dtype='null', id=None), 'comment.updated_at': Value(dtype='null', id=None), 'review.author_association': Value(dtype='null', id=None), 
'review.body': Value(dtype='null', id=None), 'review.commit_id': Value(dtype='null', id=None), 'review.id': Value(dtype='null', id=None), 'review.state': Value(dtype='null', id=None), 'review.submitted_at': Value(dtype='null', id=None), 'user.login': Value(dtype='null', id=None), 'user.type': Value(dtype='null', id=None)}, 'type': Value(dtype='string', id=None)}]} has unexpected type - {'org.id': Value(dtype='int64', id=None), 'public': Value(dtype='bool', id=None), 'pull_request.additions': Value(dtype='int64', id=None), 'pull_request.body': Value(dtype='string', id=None), 'pull_request.changed_files': Value(dtype='int64', id=None), 'pull_request.closed_at': Value(dtype='null', id=None), 'pull_request.comments': Value(dtype='int64', id=None), 'pull_request.commits': Value(dtype='int64', id=None), 'pull_request.created_at': Value(dtype='string', id=None), 'pull_request.deletions': Value(dtype='int64', id=None), 'pull_request.guid': Value(dtype='string', id=None), 'pull_request.id': Value(dtype='int64', id=None), 'pull_request.merged_at': Value(dtype='null', id=None), 'pull_request.merged_by.login': Value(dtype='null', id=None), 'pull_request.milestone.description': Value(dtype='null', id=None), 'pull_request.milestone.number': Value(dtype='null', id=None), 'pull_request.milestone.title': Value(dtype='null', id=None), 'pull_request.number': Value(dtype='int64', id=None), 'pull_request.review_comments': Value(dtype='int64', id=None), 'pull_request.state': Value(dtype='string', id=None), 'pull_request.title': Value(dtype='string', id=None), 'pull_request.user.id': Value(dtype='int64', id=None), 'pull_request.user.login': Value(dtype='string', id=None), 'repo.id': Value(dtype='int64', id=None), 'repo.name': Value(dtype='string', id=None)} (expected either {'org.id': Value(dtype='null', id=None), 'public': Value(dtype='bool', id=None), 'pull_request.additions': Value(dtype='int64', id=None), 'pull_request.body': Value(dtype='string', id=None), 
'pull_request.changed_files': Value(dtype='int64', id=None), 'pull_request.closed_at': Value(dtype='null', id=None), 'pull_request.comments': Value(dtype='int64', id=None), 'pull_request.commits': Value(dtype='int64', id=None), 'pull_request.created_at': Value(dtype='string', id=None), 'pull_request.deletions': Value(dtype='int64', id=None), 'pull_request.guid': Value(dtype='string', id=None), 'pull_request.id': Value(dtype='int64', id=None), 'pull_request.merged_at': Value(dtype='null', id=None), 'pull_request.merged_by.login': Value(dtype='null', id=None), 'pull_request.milestone.description': Value(dtype='null', id=None), 'pull_request.milestone.number': Value(dtype='null', id=None), 'pull_request.milestone.title': Value(dtype='null', id=None), 'pull_request.number': Value(dtype='int64', id=None), 'pull_request.review_comments': Value(dtype='int64', id=None), 'pull_request.state': Value(dtype='string', id=None), 'pull_request.title': Value(dtype='string', id=None), 'pull_request.user.id': Value(dtype='int64', id=None), 'pull_request.user.login': Value(dtype='string', id=None), 'repo.id': Value(dtype='int64', id=None), 'repo.name': Value(dtype='string', id=None)} or Value(\"null\").\n" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'type': 'PullRequestEvent',\n", + " 'action': 'opened',\n", + " 'actor.login': 'dependabot[bot]',\n", + " 'actor.id': 49699333,\n", + " 'user.login': None,\n", + " 'user.id': None,\n", + " 'user.type': None,\n", + " 'repo.name': 'AbdElrahmanMuhammedNasr/WuzuufMasr',\n", + " 'repo.id': 210433834,\n", + " 'public': True,\n", + " 'created_at': '2022-12-10T03:27:08Z',\n", + " 'org.id': None,\n", + " 'org.login': None,\n", + " 'pull_request.id': 1157080683,\n", + " 'pull_request.number': 35,\n", + " 'pull_request.state': 'open',\n", + " 'pull_request.title': 'Bump qs from 6.5.2 to 6.5.3',\n", + " 'pull_request.body': 'Bumps 
[qs](https://github.com/ljharb/qs) from 6.5.2 to 6.5.3.\\n
\\nChangelog\\n

Sourced from qs\\'s changelog.

\\n
\\n

6.5.3

\\n
    \\n
  • [Fix] parse: ignore __proto__ keys (#428)
  • \\n
  • [Fix] utils.merge: avoid a crash with a null target and a truthy non-array source
  • \\n
  • [Fix] correctly parse nested arrays
  • \\n
  • [Fix] stringify: fix a crash with strictNullHandling and a custom filter/serializeDate (#279)
  • \\n
  • [Fix] utils: merge: fix crash when source is a truthy primitive & no options are provided
  • \\n
  • [Fix] when parseArrays is false, properly handle keys ending in []
  • \\n
  • [Fix] fix for an impossible situation: when the formatter is called with a non-string value
  • \\n
  • [Fix] utils.merge: avoid a crash with a null target and an array source
  • \\n
  • [Refactor] utils: reduce observable [[Get]]s
  • \\n
  • [Refactor] use cached Array.isArray
  • \\n
  • [Refactor] stringify: Avoid arr = arr.concat(...), push to the existing instance (#269)
  • \\n
  • [Refactor] parse: only need to reassign the var once
  • \\n
  • [Robustness] stringify: avoid relying on a global undefined (#427)
  • \\n
  • [readme] remove travis badge; add github actions/codecov badges; update URLs
  • \\n
  • [Docs] Clean up license text so it’s properly detected as BSD-3-Clause
  • \\n
  • [Docs] Clarify the need for "arrayLimit" option
  • \\n
  • [meta] fix README.md (#399)
  • \\n
  • [meta] add FUNDING.yml
  • \\n
  • [actions] backport actions from main
  • \\n
  • [Tests] always use String(x) over x.toString()
  • \\n
  • [Tests] remove nonexistent tape option
  • \\n
  • [Dev Deps] backport from main
  • \\n
\\n
\\n
\\n
\\nCommits\\n
    \\n
  • 298bfa5 v6.5.3
  • \\n
  • ed0f5dc [Fix] parse: ignore __proto__ keys (#428)
  • \\n
  • 691e739 [Robustness] stringify: avoid relying on a global undefined (#427)
  • \\n
  • 1072d57 [readme] remove travis badge; add github actions/codecov badges; update URLs
  • \\n
  • 12ac1c4 [meta] fix README.md (#399)
  • \\n
  • 0338716 [actions] backport actions from main
  • \\n
  • 5639c20 Clean up license text so it’s properly detected as BSD-3-Clause
  • \\n
  • 51b8a0b add FUNDING.yml
  • \\n
  • 45f6759 [Fix] fix for an impossible situation: when the formatter is called with a no...
  • \\n
  • f814a7f [Dev Deps] backport from main
  • \\n
  • Additional commits viewable in compare view
  • \\n
\\n
\\n
\\n\\n\\n[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=qs&package-manager=npm_and_yarn&previous-version=6.5.2&new-version=6.5.3)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)\\n\\nDependabot will resolve any conflicts with this PR as long as you don\\'t alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.\\n\\n[//]: # (dependabot-automerge-start)\\n[//]: # (dependabot-automerge-end)\\n\\n---\\n\\n
\\nDependabot commands and options\\n
\\n\\nYou can trigger Dependabot actions by commenting on this PR:\\n- `@dependabot rebase` will rebase this PR\\n- `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it\\n- `@dependabot merge` will merge this PR after your CI passes on it\\n- `@dependabot squash and merge` will squash and merge this PR after your CI passes on it\\n- `@dependabot cancel merge` will cancel a previously requested merge and block automerging\\n- `@dependabot reopen` will reopen this PR if it is closed\\n- `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually\\n- `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)\\n- `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)\\n- `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)\\n- `@dependabot use these labels` will set the current labels as the default for future PRs for this repo and language\\n- `@dependabot use these reviewers` will set the current reviewers as the default for future PRs for this repo and language\\n- `@dependabot use these assignees` will set the current assignees as the default for future PRs for this repo and language\\n- `@dependabot use this milestone` will set the current milestone as the default for future PRs for this repo and language\\n\\nYou can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/AbdElrahmanMuhammedNasr/WuzuufMasr/network/alerts).\\n\\n
',\n", + " 'pull_request.user.login': 'dependabot[bot]',\n", + " 'pull_request.user.id': 49699333,\n", + " 'pull_request.author_association': 'NONE',\n", + " 'pull_request.created_at': '2022-12-10T03:27:08Z',\n", + " 'pull_request.updated_at': '2022-12-10T03:27:08Z',\n", + " 'pull_request.closed_at': None,\n", + " 'pull_request.merged_at': None,\n", + " 'pull_request.merge_commit_sha': None,\n", + " 'pull_request.locked': False,\n", + " 'pull_request.assignee.login': None,\n", + " 'pull_request.assignee.id': None,\n", + " 'pull_request.assignee.type': None,\n", + " 'pull_request.assignee.site_admin': None,\n", + " 'pull_request.milestone.id': None,\n", + " 'pull_request.milestone.number': None,\n", + " 'pull_request.milestone.title': None,\n", + " 'pull_request.milestone.description': None,\n", + " 'pull_request.milestone.creator.login': None,\n", + " 'pull_request.milestone.creator.id': None,\n", + " 'pull_request.milestone.creator.type': None,\n", + " 'pull_request.milestone.creator.site_admin': None,\n", + " 'pull_request.milestone.open_issues': None,\n", + " 'pull_request.milestone.closed_issues': None,\n", + " 'pull_request.milestone.state': None,\n", + " 'pull_request.milestone.created_at': None,\n", + " 'pull_request.milestone.updated_at': None,\n", + " 'pull_request.milestone.due_on': None,\n", + " 'pull_request.milestone.closed_at': None,\n", + " 'pull_request.merged': False,\n", + " 'pull_request.mergeable': None,\n", + " 'pull_request.mergeable_state': 'unknown',\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.id': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.merged_by.site_admin': None,\n", + " 'pull_request.comments': 0,\n", + " 'pull_request.review_comments': 0,\n", + " 'pull_request.commits': 1,\n", + " 'pull_request.additions': 3,\n", + " 'pull_request.deletions': 3,\n", + " 'pull_request.changed_files': 1,\n", + " 'pull_request.label.id': None,\n", + " 'pull_request.label.name': None,\n", + 
" 'pull_request.label.color': None,\n", + " 'pull_request.label.default': None,\n", + " 'pull_request.head.label': 'AbdElrahmanMuhammedNasr:dependabot/npm_and_yarn/qs-6.5.3',\n", + " 'pull_request.head.ref': 'dependabot/npm_and_yarn/qs-6.5.3',\n", + " 'pull_request.head.sha': '94469b10a02fa77e95bb22aaa0fbcc16ef03edfd',\n", + " 'pull_request.head.user.login': 'AbdElrahmanMuhammedNasr',\n", + " 'pull_request.head.user.type': 'User',\n", + " 'pull_request.head.repo.name': 'WuzuufMasr',\n", + " 'pull_request.head.repo.full_name': 'AbdElrahmanMuhammedNasr/WuzuufMasr',\n", + " 'pull_request.head.repo.owner.login': 'AbdElrahmanMuhammedNasr',\n", + " 'pull_request.head.repo.owner.type': 'User',\n", + " 'pull_request.head.repo.private': False,\n", + " 'pull_request.head.repo.homepage': None,\n", + " 'pull_request.head.repo.description': None,\n", + " 'pull_request.head.repo.fork': False,\n", + " 'pull_request.head.repo.created_at': '2019-09-23T19:17:51Z',\n", + " 'pull_request.head.repo.updated_at': '2019-10-11T19:57:45Z',\n", + " 'pull_request.head.repo.pushed_at': '2022-12-10T03:27:07Z',\n", + " 'pull_request.head.repo.size': 1345,\n", + " 'pull_request.head.repo.stargazers_count': 0,\n", + " 'pull_request.head.repo.watchers_count': 0,\n", + " 'pull_request.head.repo.language': 'TypeScript',\n", + " 'pull_request.head.repo.has_issues': True,\n", + " 'pull_request.head.repo.has_projects': True,\n", + " 'pull_request.head.repo.has_downloads': True,\n", + " 'pull_request.head.repo.has_wiki': True,\n", + " 'pull_request.head.repo.has_pages': False,\n", + " 'pull_request.head.repo.forks_count': 0,\n", + " 'pull_request.head.repo.archived': False,\n", + " 'pull_request.head.repo.disabled': False,\n", + " 'pull_request.head.repo.open_issues_count': 24,\n", + " 'pull_request.head.repo.forks': 0,\n", + " 'pull_request.head.repo.open_issues': 24,\n", + " 'pull_request.head.repo.watchers': 0,\n", + " 'pull_request.head.repo.default_branch': 'master',\n", + " 
'pull_request.head.repo.license.key': None,\n", + " 'pull_request.head.repo.license.spdx_id': None,\n", + " 'pull_request.head.repo.license.name': None,\n", + " 'pull_request.base.label': 'AbdElrahmanMuhammedNasr:master',\n", + " 'pull_request.base.ref': 'master',\n", + " 'pull_request.base.sha': 'a7d0127c02152dca69c41f83afb1a0a4d0c0e004',\n", + " 'pull_request.base.user.login': 'AbdElrahmanMuhammedNasr',\n", + " 'pull_request.base.user.type': 'User',\n", + " 'pull_request.base.repo.name': 'WuzuufMasr',\n", + " 'pull_request.base.repo.full_name': 'AbdElrahmanMuhammedNasr/WuzuufMasr',\n", + " 'pull_request.base.repo.owner.login': 'AbdElrahmanMuhammedNasr',\n", + " 'pull_request.base.repo.owner.type': 'User',\n", + " 'pull_request.base.repo.private': False,\n", + " 'pull_request.base.repo.homepage': None,\n", + " 'pull_request.base.repo.description': None,\n", + " 'pull_request.base.repo.fork': False,\n", + " 'pull_request.base.repo.created_at': '2019-09-23T19:17:51Z',\n", + " 'pull_request.base.repo.updated_at': '2019-10-11T19:57:45Z',\n", + " 'pull_request.base.repo.pushed_at': '2022-12-10T03:27:07Z',\n", + " 'pull_request.base.repo.size': 1345,\n", + " 'pull_request.base.repo.stargazers_count': 0,\n", + " 'pull_request.base.repo.watchers_count': 0,\n", + " 'pull_request.base.repo.language': 'TypeScript',\n", + " 'pull_request.base.repo.has_issues': True,\n", + " 'pull_request.base.repo.has_projects': True,\n", + " 'pull_request.base.repo.has_downloads': True,\n", + " 'pull_request.base.repo.has_wiki': True,\n", + " 'pull_request.base.repo.has_pages': False,\n", + " 'pull_request.base.repo.forks_count': 0,\n", + " 'pull_request.base.repo.archived': False,\n", + " 'pull_request.base.repo.disabled': False,\n", + " 'pull_request.base.repo.open_issues_count': 24,\n", + " 'pull_request.base.repo.forks': 0,\n", + " 'pull_request.base.repo.open_issues': 24,\n", + " 'pull_request.base.repo.watchers': 0,\n", + " 'pull_request.base.repo.default_branch': 'master',\n", + " 
'pull_request.base.repo.license.key': None,\n", + " 'pull_request.base.repo.license.spdx_id': None,\n", + " 'pull_request.base.repo.license.name': None,\n", + " 'pull_request.guid': 'AbdElrahmanMuhammedNasr/WuzuufMasr/pull/35'}]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import json\n", + "res = json.loads(small_ds[0]['pull_request.events'])\n", + "res" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'action': 'opened',\n", + " 'author': 'hillc-usgs',\n", + " 'comment': None,\n", + " 'comment_id': None,\n", + " 'datetime': '2021-06-24T17:23:03Z',\n", + " 'description': 'This PR makes nldi_flowtools able to work with the new pygeoapi restructure, and makes it installable directly into the new tool. The processors are now contained within the library for nldi_flowtools directly, which makes it far simpler to roll out the plugin without needing coding modifications to the USGS pygeoapi tool.',\n", + " 'title': 'pygeoapi_plugins refit',\n", + " 'type': 'issue'},\n", + " {'action': 'created',\n", + " 'author': 'rmcd-mscb',\n", + " 'comment': \"@Anders-Hopkins - I merged Cliff's changes to keep things moving but you might want to review the changes for yourself when you get back. 
\",\n", + " 'comment_id': 868826717.0,\n", + " 'datetime': '2021-06-25 20:51:35+00:00',\n", + " 'description': None,\n", + " 'title': None,\n", + " 'type': 'comment'}]" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "issues = issues[0][\"events\"]\n", + "issues" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10\n" + ] + } + ], + "source": [ + "for i in range(3, 20):\n", + " row = small_ds[i]\n", + " events = load_json(row[\"pull_request.events\"])\n", + " reviews = load_json(row[\"pull_request.code_review_events\"])\n", + " issues = load_json(row[\"pull_request.issue_events\"])\n", + " if reviews:\n", + " print(i)\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "len events 2, len reviews 1, len issues 1\n" + ] + } + ], + "source": [ + "row = small_ds[10]\n", + "events = load_json(row[\"pull_request.events\"])\n", + "reviews = load_json(row[\"pull_request.code_review_events\"])\n", + "issues = load_json(row[\"pull_request.issue_events\"])\n", + "print(f\"len events {len(events)}, len reviews {len(reviews)}, len issues {len(issues)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "events = load_json(row[\"pull_request.events\"])\n", + "reviews = load_json(row[\"pull_request.code_review_events\"])\n", + "issues = load_json(row[\"pull_request.issue_events\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "L = events + reviews + issues" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "metadata": {}, + "outputs": [], + "source": [ + "events = load_json(row[\"pull_request.events\"])\n", + "reviews = 
load_json(row[\"pull_request.code_review_events\"])\n", + "issues = load_json(row[\"pull_request.issue_events\"])\n", + "assert len(issues) == 1\n", + "issues_events = issues[0][\"events\"]\n", + "# for each events in each category group all events sorted by \"created_at\" in one list\n", + "for e in issues_events:\n", + " e[\"created_at\"] = parse(e[\"datetime\"])\n", + " del e[\"datetime\"]\n", + "events = [update_datetime(e) for e in events]\n", + "reviews = [update_datetime(e) for e in reviews]\n", + "all_events = sorted(\n", + " events + reviews + issues_events,\n", + " key=lambda x: x[\"created_at\"]\n", + ")\n", + "\n", + "pr_info = {k: events[0][k] for k in pull_request_info_cols}\n", + "head_info = {k: events[0][k] for k in head_info_cols}\n", + "base_info = {k: events[0][k] for k in base_info_cols}\n", + "# each comment should have \"comments\" and \"review_comments\" fields with \"extra_review_info\" field\n", + "comments = [{\"type\": e[\"type\"],\n", + " \"action\": e[\"action\"],\n", + " \"created_at\": e[\"created_at\"],\n", + " \"review_comments\": get_review_info(e),\n", + " \"issues_comments\": get_issue_info(e)} for e in all_events]\n", + "new_row = {\"pull_request_info\": pr_info, \"head_repo_info\": head_info, \"base_repo_info\": base_info, \"events\": comments}" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['pull_request_info', 'head_repo_info', 'base_repo_info', 'events'])" + ] + }, + "execution_count": 131, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_row.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 146, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "**GitHub Repo**: ACWI-SSWD/nldi_flowtools, PR Number: 4, ID: 677298606\n", + "**GitHub Link**: https://github.com/ACWI-SSWD/nldi_flowtools/pull/4\n", + 
"----------------------------------------------------------------------------------------------------\n", + "Type: issue, action: opened, created_at: 2021-06-24 17:23:03+00:00\n", + "Author hillc-usgs did opened:\n", + "None\n", + "----------------------------------------------------------------------------------------------------\n", + "Type: PullRequestEvent, action: opened, created_at: 2021-06-24 17:23:04+00:00\n", + "Author hillc-usgs with association None did opened\n", + "----------------------------------------------------------------------------------------------------\n", + "Type: PullRequestReviewEvent, action: created, created_at: 2021-06-25 20:50:41+00:00\n", + "Author rmcd-mscb with association NONE did created\n", + "Review:\n", + "Thanks Cliff - Anders has been out this week, to keep things moving I'll merge the request and leave the branch for him to view when he gets back. \n", + "----------------------------------------------------------------------------------------------------\n", + "Type: PullRequestEvent, action: closed, created_at: 2021-06-25 20:50:54+00:00\n", + "Author rmcd-mscb with association None did closed\n", + "----------------------------------------------------------------------------------------------------\n", + "Type: comment, action: created, created_at: 2021-06-25 20:51:35+00:00\n", + "Author rmcd-mscb did created:\n", + "@Anders-Hopkins - I merged Cliff's changes to keep things moving but you might want to review the changes for yourself when you get back. 
\n" + ] + } + ], + "source": [ + "pr_info = new_row[\"pull_request_info\"]\n", + "res = f\"**GitHub Repo**: {pr_info['repo.name']}, PR Number: {pr_info['pull_request.number']}, ID: {pr_info['pull_request.id']}\"\n", + "gh_link = f\"https://github.com/{pr_info['repo.name']}/pull/{pr_info['pull_request.number']}\"\n", + "res += f\"\\n**GitHub Link**: {gh_link}\"\n", + "print(res)\n", + "for i in range(len(new_row[\"events\"])):\n", + " e = new_row[\"events\"][i]\n", + " print(\"-\" * 100)\n", + " print(f\"Type: {e['type']}, action: {e['action']}, created_at: {e['created_at']}\")\n", + " action = e['action']\n", + "\n", + " if e['type'] in [\"issue\", \"comment\"]:\n", + " e = e[\"issues_comments\"]\n", + " print(f\"Author {e['author']} did {e['action']}:\\n{e['comment']}\")\n", + "\n", + " elif e['type'] in [\"PullRequestEvent\", \"PullRequestReviewCommentEvent\", \"PullRequestReviewEvent\"]:\n", + " reviews = e[\"review_comments\"]\n", + " print(f\"Author {reviews['actor.login']} with association {reviews['review.author_association']} did {action}\")\n", + " if reviews['review.body']:\n", + " print(f\"Review:\\n{reviews['review.body']}\")\n", + " if reviews['comment.body']:\n", + " print(f\"Comment:\\n{reviews['comment.body']}\")\n", + " if reviews['comment.diff_hunk']:\n", + " print(f\"Diff hunk:\\n{reviews['diff_hunk']}\")\n", + " print(f\"File path {reviews['path']}\")\n", + " else:\n", + " print(\"OTHER\")\n", + " print(e[\"type\"])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'pull_request.base.label': 'ACWI-SSWD:master',\n", + " 'pull_request.base.ref': 'master',\n", + " 'pull_request.base.sha': '4ce49143e7ce6e473554c3ebf7335a23d91ca91c',\n", + " 'pull_request.base.user.login': 'ACWI-SSWD',\n", + " 'pull_request.base.user.type': 'Organization',\n", + " 'pull_request.base.repo.owner.login': 'ACWI-SSWD',\n", + " 'pull_request.base.repo.owner.type': 'Organization',\n", + " 
'pull_request.base.repo.license.name': 'BSD 3-Clause \"New\" or \"Revised\" License',\n", + " 'pull_request.base.repo.default_branch': 'master',\n", + " 'pull_request.base.repo.description': None,\n", + " 'pull_request.base.repo.language': 'Python',\n", + " 'pull_request.base.repo.watchers_count': 3,\n", + " 'pull_request.base.repo.open_issues_count': 1,\n", + " 'pull_request.base.repo.forks_count': 0,\n", + " 'pull_request.base.repo.name': 'nldi_flowtools',\n", + " 'pull_request.base.repo.homepage': None,\n", + " 'pull_request.base.repo.stargazers_count': 3,\n", + " 'pull_request.base.repo.private': False,\n", + " 'pull_request.comments': 0,\n", + " 'pull_request.review_comments': 0,\n", + " 'pull_request.label.name': None}" + ] + }, + "execution_count": 144, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pr_info" + ] + }, + { + "cell_type": "code", + "execution_count": 145, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'type': 'PullRequestEvent',\n", + " 'action': 'opened',\n", + " 'actor.login': 'hillc-usgs',\n", + " 'actor.id': 84474574,\n", + " 'user.login': None,\n", + " 'user.id': None,\n", + " 'user.type': None,\n", + " 'repo.name': 'ACWI-SSWD/nldi_flowtools',\n", + " 'repo.id': 365244721,\n", + " 'public': True,\n", + " 'created_at': datetime.datetime(2021, 6, 24, 17, 23, 4, tzinfo=tzlocal()),\n", + " 'org.id': 17301770,\n", + " 'org.login': 'ACWI-SSWD',\n", + " 'pull_request.id': 677298606,\n", + " 'pull_request.number': 4,\n", + " 'pull_request.state': 'open',\n", + " 'pull_request.title': 'pygeoapi_plugins refit',\n", + " 'pull_request.body': 'This PR makes nldi_flowtools able to work with the new pygeoapi restructure, and makes it installable directly into the new tool. 
The processors are now contained within the library for nldi_flowtools directly, which makes it far simpler to roll out the plugin without needing coding modifications to the USGS pygeoapi tool.',\n", + " 'pull_request.user.login': 'hillc-usgs',\n", + " 'pull_request.user.id': 84474574,\n", + " 'pull_request.author_association': 'NONE',\n", + " 'pull_request.created_at': '2021-06-24T17:23:03Z',\n", + " 'pull_request.updated_at': '2021-06-24T17:23:03Z',\n", + " 'pull_request.closed_at': None,\n", + " 'pull_request.merged_at': None,\n", + " 'pull_request.merge_commit_sha': None,\n", + " 'pull_request.locked': False,\n", + " 'pull_request.assignee.login': None,\n", + " 'pull_request.assignee.id': None,\n", + " 'pull_request.assignee.type': None,\n", + " 'pull_request.assignee.site_admin': None,\n", + " 'pull_request.milestone.id': None,\n", + " 'pull_request.milestone.number': None,\n", + " 'pull_request.milestone.title': None,\n", + " 'pull_request.milestone.description': None,\n", + " 'pull_request.milestone.creator.login': None,\n", + " 'pull_request.milestone.creator.id': None,\n", + " 'pull_request.milestone.creator.type': None,\n", + " 'pull_request.milestone.creator.site_admin': None,\n", + " 'pull_request.milestone.open_issues': None,\n", + " 'pull_request.milestone.closed_issues': None,\n", + " 'pull_request.milestone.state': None,\n", + " 'pull_request.milestone.created_at': None,\n", + " 'pull_request.milestone.updated_at': None,\n", + " 'pull_request.milestone.due_on': None,\n", + " 'pull_request.milestone.closed_at': None,\n", + " 'pull_request.merged': False,\n", + " 'pull_request.mergeable': None,\n", + " 'pull_request.mergeable_state': 'unknown',\n", + " 'pull_request.merged_by.login': None,\n", + " 'pull_request.merged_by.id': None,\n", + " 'pull_request.merged_by.type': None,\n", + " 'pull_request.merged_by.site_admin': None,\n", + " 'pull_request.comments': 0,\n", + " 'pull_request.review_comments': 0,\n", + " 'pull_request.commits': 5,\n", + " 
'pull_request.additions': 321,\n", + " 'pull_request.deletions': 25,\n", + " 'pull_request.changed_files': 5,\n", + " 'pull_request.label.id': None,\n", + " 'pull_request.label.name': None,\n", + " 'pull_request.label.color': None,\n", + " 'pull_request.label.default': None,\n", + " 'pull_request.head.label': 'ACWI-SSWD:pygeoapi_plugins-refit',\n", + " 'pull_request.head.ref': 'pygeoapi_plugins-refit',\n", + " 'pull_request.head.sha': '9143699913269aff0814979d932957efeb002eb1',\n", + " 'pull_request.head.user.login': 'ACWI-SSWD',\n", + " 'pull_request.head.user.type': 'Organization',\n", + " 'pull_request.head.repo.name': 'nldi_flowtools',\n", + " 'pull_request.head.repo.full_name': 'ACWI-SSWD/nldi_flowtools',\n", + " 'pull_request.head.repo.owner.login': 'ACWI-SSWD',\n", + " 'pull_request.head.repo.owner.type': 'Organization',\n", + " 'pull_request.head.repo.private': False,\n", + " 'pull_request.head.repo.homepage': None,\n", + " 'pull_request.head.repo.description': None,\n", + " 'pull_request.head.repo.fork': False,\n", + " 'pull_request.head.repo.created_at': '2021-05-07T13:36:47Z',\n", + " 'pull_request.head.repo.updated_at': '2021-06-23T14:27:31Z',\n", + " 'pull_request.head.repo.pushed_at': '2021-06-24T15:15:30Z',\n", + " 'pull_request.head.repo.size': 4309,\n", + " 'pull_request.head.repo.stargazers_count': 3,\n", + " 'pull_request.head.repo.watchers_count': 3,\n", + " 'pull_request.head.repo.language': 'Python',\n", + " 'pull_request.head.repo.has_issues': True,\n", + " 'pull_request.head.repo.has_projects': True,\n", + " 'pull_request.head.repo.has_downloads': True,\n", + " 'pull_request.head.repo.has_wiki': True,\n", + " 'pull_request.head.repo.has_pages': False,\n", + " 'pull_request.head.repo.forks_count': 0,\n", + " 'pull_request.head.repo.archived': False,\n", + " 'pull_request.head.repo.disabled': False,\n", + " 'pull_request.head.repo.open_issues_count': 1,\n", + " 'pull_request.head.repo.forks': 0,\n", + " 'pull_request.head.repo.open_issues': 
1,\n", + " 'pull_request.head.repo.watchers': 3,\n", + " 'pull_request.head.repo.default_branch': 'master',\n", + " 'pull_request.head.repo.license.key': 'bsd-3-clause',\n", + " 'pull_request.head.repo.license.spdx_id': 'BSD-3-Clause',\n", + " 'pull_request.head.repo.license.name': 'BSD 3-Clause \"New\" or \"Revised\" License',\n", + " 'pull_request.base.label': 'ACWI-SSWD:master',\n", + " 'pull_request.base.ref': 'master',\n", + " 'pull_request.base.sha': '4ce49143e7ce6e473554c3ebf7335a23d91ca91c',\n", + " 'pull_request.base.user.login': 'ACWI-SSWD',\n", + " 'pull_request.base.user.type': 'Organization',\n", + " 'pull_request.base.repo.name': 'nldi_flowtools',\n", + " 'pull_request.base.repo.full_name': 'ACWI-SSWD/nldi_flowtools',\n", + " 'pull_request.base.repo.owner.login': 'ACWI-SSWD',\n", + " 'pull_request.base.repo.owner.type': 'Organization',\n", + " 'pull_request.base.repo.private': False,\n", + " 'pull_request.base.repo.homepage': None,\n", + " 'pull_request.base.repo.description': None,\n", + " 'pull_request.base.repo.fork': False,\n", + " 'pull_request.base.repo.created_at': '2021-05-07T13:36:47Z',\n", + " 'pull_request.base.repo.updated_at': '2021-06-23T14:27:31Z',\n", + " 'pull_request.base.repo.pushed_at': '2021-06-24T15:15:30Z',\n", + " 'pull_request.base.repo.size': 4309,\n", + " 'pull_request.base.repo.stargazers_count': 3,\n", + " 'pull_request.base.repo.watchers_count': 3,\n", + " 'pull_request.base.repo.language': 'Python',\n", + " 'pull_request.base.repo.has_issues': True,\n", + " 'pull_request.base.repo.has_projects': True,\n", + " 'pull_request.base.repo.has_downloads': True,\n", + " 'pull_request.base.repo.has_wiki': True,\n", + " 'pull_request.base.repo.has_pages': False,\n", + " 'pull_request.base.repo.forks_count': 0,\n", + " 'pull_request.base.repo.archived': False,\n", + " 'pull_request.base.repo.disabled': False,\n", + " 'pull_request.base.repo.open_issues_count': 1,\n", + " 'pull_request.base.repo.forks': 0,\n", + " 
'pull_request.base.repo.open_issues': 1,\n", + " 'pull_request.base.repo.watchers': 3,\n", + " 'pull_request.base.repo.default_branch': 'master',\n", + " 'pull_request.base.repo.license.key': 'bsd-3-clause',\n", + " 'pull_request.base.repo.license.spdx_id': 'BSD-3-Clause',\n", + " 'pull_request.base.repo.license.name': 'BSD 3-Clause \"New\" or \"Revised\" License',\n", + " 'pull_request.guid': 'ACWI-SSWD/nldi_flowtools/pull/4'},\n", + " {'type': 'PullRequestEvent',\n", + " 'action': 'closed',\n", + " 'actor.login': 'rmcd-mscb',\n", + " 'actor.id': 11791580,\n", + " 'user.login': None,\n", + " 'user.id': None,\n", + " 'user.type': None,\n", + " 'repo.name': 'ACWI-SSWD/nldi_flowtools',\n", + " 'repo.id': 365244721,\n", + " 'public': True,\n", + " 'created_at': datetime.datetime(2021, 6, 25, 20, 50, 54, tzinfo=tzlocal()),\n", + " 'org.id': 17301770,\n", + " 'org.login': 'ACWI-SSWD',\n", + " 'pull_request.id': 677298606,\n", + " 'pull_request.number': 4,\n", + " 'pull_request.state': 'closed',\n", + " 'pull_request.title': 'pygeoapi_plugins refit',\n", + " 'pull_request.body': 'This PR makes nldi_flowtools able to work with the new pygeoapi restructure, and makes it installable directly into the new tool. 
The processors are now contained within the library for nldi_flowtools directly, which makes it far simpler to roll out the plugin without needing coding modifications to the USGS pygeoapi tool.',\n", + " 'pull_request.user.login': 'hillc-usgs',\n", + " 'pull_request.user.id': 84474574,\n", + " 'pull_request.author_association': 'NONE',\n", + " 'pull_request.created_at': '2021-06-24T17:23:03Z',\n", + " 'pull_request.updated_at': '2021-06-25T20:50:53Z',\n", + " 'pull_request.closed_at': '2021-06-25T20:50:53Z',\n", + " 'pull_request.merged_at': '2021-06-25T20:50:53Z',\n", + " 'pull_request.merge_commit_sha': 'c0a8e850c8e627b0474b9059582e7a61e5fd3699',\n", + " 'pull_request.locked': False,\n", + " 'pull_request.assignee.login': None,\n", + " 'pull_request.assignee.id': None,\n", + " 'pull_request.assignee.type': None,\n", + " 'pull_request.assignee.site_admin': None,\n", + " 'pull_request.milestone.id': None,\n", + " 'pull_request.milestone.number': None,\n", + " 'pull_request.milestone.title': None,\n", + " 'pull_request.milestone.description': None,\n", + " 'pull_request.milestone.creator.login': None,\n", + " 'pull_request.milestone.creator.id': None,\n", + " 'pull_request.milestone.creator.type': None,\n", + " 'pull_request.milestone.creator.site_admin': None,\n", + " 'pull_request.milestone.open_issues': None,\n", + " 'pull_request.milestone.closed_issues': None,\n", + " 'pull_request.milestone.state': None,\n", + " 'pull_request.milestone.created_at': None,\n", + " 'pull_request.milestone.updated_at': None,\n", + " 'pull_request.milestone.due_on': None,\n", + " 'pull_request.milestone.closed_at': None,\n", + " 'pull_request.merged': True,\n", + " 'pull_request.mergeable': None,\n", + " 'pull_request.mergeable_state': 'unknown',\n", + " 'pull_request.merged_by.login': 'rmcd-mscb',\n", + " 'pull_request.merged_by.id': 11791580,\n", + " 'pull_request.merged_by.type': 'User',\n", + " 'pull_request.merged_by.site_admin': False,\n", + " 'pull_request.comments': 0,\n", 
+ " 'pull_request.review_comments': 0,\n", + " 'pull_request.commits': 7,\n", + " 'pull_request.additions': 292,\n", + " 'pull_request.deletions': 1,\n", + " 'pull_request.changed_files': 5,\n", + " 'pull_request.label.id': None,\n", + " 'pull_request.label.name': None,\n", + " 'pull_request.label.color': None,\n", + " 'pull_request.label.default': None,\n", + " 'pull_request.head.label': 'ACWI-SSWD:pygeoapi_plugins-refit',\n", + " 'pull_request.head.ref': 'pygeoapi_plugins-refit',\n", + " 'pull_request.head.sha': '3e3fe0dfdfce5fe24c25231c3207c2d292b31165',\n", + " 'pull_request.head.user.login': 'ACWI-SSWD',\n", + " 'pull_request.head.user.type': 'Organization',\n", + " 'pull_request.head.repo.name': 'nldi_flowtools',\n", + " 'pull_request.head.repo.full_name': 'ACWI-SSWD/nldi_flowtools',\n", + " 'pull_request.head.repo.owner.login': 'ACWI-SSWD',\n", + " 'pull_request.head.repo.owner.type': 'Organization',\n", + " 'pull_request.head.repo.private': False,\n", + " 'pull_request.head.repo.homepage': None,\n", + " 'pull_request.head.repo.description': None,\n", + " 'pull_request.head.repo.fork': False,\n", + " 'pull_request.head.repo.created_at': '2021-05-07T13:36:47Z',\n", + " 'pull_request.head.repo.updated_at': '2021-06-23T14:27:31Z',\n", + " 'pull_request.head.repo.pushed_at': '2021-06-25T20:50:53Z',\n", + " 'pull_request.head.repo.size': 4310,\n", + " 'pull_request.head.repo.stargazers_count': 3,\n", + " 'pull_request.head.repo.watchers_count': 3,\n", + " 'pull_request.head.repo.language': 'Python',\n", + " 'pull_request.head.repo.has_issues': True,\n", + " 'pull_request.head.repo.has_projects': True,\n", + " 'pull_request.head.repo.has_downloads': True,\n", + " 'pull_request.head.repo.has_wiki': True,\n", + " 'pull_request.head.repo.has_pages': False,\n", + " 'pull_request.head.repo.forks_count': 0,\n", + " 'pull_request.head.repo.archived': False,\n", + " 'pull_request.head.repo.disabled': False,\n", + " 'pull_request.head.repo.open_issues_count': 0,\n", + " 
'pull_request.head.repo.forks': 0,\n", + " 'pull_request.head.repo.open_issues': 0,\n", + " 'pull_request.head.repo.watchers': 3,\n", + " 'pull_request.head.repo.default_branch': 'master',\n", + " 'pull_request.head.repo.license.key': 'bsd-3-clause',\n", + " 'pull_request.head.repo.license.spdx_id': 'BSD-3-Clause',\n", + " 'pull_request.head.repo.license.name': 'BSD 3-Clause \"New\" or \"Revised\" License',\n", + " 'pull_request.base.label': 'ACWI-SSWD:master',\n", + " 'pull_request.base.ref': 'master',\n", + " 'pull_request.base.sha': '4ce49143e7ce6e473554c3ebf7335a23d91ca91c',\n", + " 'pull_request.base.user.login': 'ACWI-SSWD',\n", + " 'pull_request.base.user.type': 'Organization',\n", + " 'pull_request.base.repo.name': 'nldi_flowtools',\n", + " 'pull_request.base.repo.full_name': 'ACWI-SSWD/nldi_flowtools',\n", + " 'pull_request.base.repo.owner.login': 'ACWI-SSWD',\n", + " 'pull_request.base.repo.owner.type': 'Organization',\n", + " 'pull_request.base.repo.private': False,\n", + " 'pull_request.base.repo.homepage': None,\n", + " 'pull_request.base.repo.description': None,\n", + " 'pull_request.base.repo.fork': False,\n", + " 'pull_request.base.repo.created_at': '2021-05-07T13:36:47Z',\n", + " 'pull_request.base.repo.updated_at': '2021-06-23T14:27:31Z',\n", + " 'pull_request.base.repo.pushed_at': '2021-06-25T20:50:53Z',\n", + " 'pull_request.base.repo.size': 4310,\n", + " 'pull_request.base.repo.stargazers_count': 3,\n", + " 'pull_request.base.repo.watchers_count': 3,\n", + " 'pull_request.base.repo.language': 'Python',\n", + " 'pull_request.base.repo.has_issues': True,\n", + " 'pull_request.base.repo.has_projects': True,\n", + " 'pull_request.base.repo.has_downloads': True,\n", + " 'pull_request.base.repo.has_wiki': True,\n", + " 'pull_request.base.repo.has_pages': False,\n", + " 'pull_request.base.repo.forks_count': 0,\n", + " 'pull_request.base.repo.archived': False,\n", + " 'pull_request.base.repo.disabled': False,\n", + " 
'pull_request.base.repo.open_issues_count': 0,\n", + " 'pull_request.base.repo.forks': 0,\n", + " 'pull_request.base.repo.open_issues': 0,\n", + " 'pull_request.base.repo.watchers': 3,\n", + " 'pull_request.base.repo.default_branch': 'master',\n", + " 'pull_request.base.repo.license.key': 'bsd-3-clause',\n", + " 'pull_request.base.repo.license.spdx_id': 'BSD-3-Clause',\n", + " 'pull_request.base.repo.license.name': 'BSD 3-Clause \"New\" or \"Revised\" License',\n", + " 'pull_request.guid': 'ACWI-SSWD/nldi_flowtools/pull/4'}]" + ] + }, + "execution_count": 145, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "events" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'2021-06-24T17:23:03Z'" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "events[0][\"created_at\"]\n", + "issues[0][\"events\"][0][\"datetime\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'action': 'created',\n", + " 'author': 'rmcd-mscb',\n", + " 'comment': \"@Anders-Hopkins - I merged Cliff's changes to keep things moving but you might want to review the changes for yourself when you get back. 
\",\n", + " 'comment_id': 868826717.0,\n", + " 'datetime': '2021-06-25 20:51:35+00:00',\n", + " 'description': None,\n", + " 'title': None,\n", + " 'type': 'comment'}" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "issues[0][\"events\"][1]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['repo', 'org', 'issue_id', 'issue_number', 'pull_request', 'events'])" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "issues[0].keys()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_events = sorted(\n", + " events + reviews + issues,\n", + " key=lambda x: x[\"created_at\"]\n", + ")\n", + "pr_info = {k: all_events[-1][k] for k in pull_request_info_cols}\n", + "head_info = {k: all_events[-1][k] for k in head_info_cols}\n", + "base_info = {k: all_events[-1][k] for k in base_info_cols}\n", + "# each comment should have \"comments\" and \"review_comments\" fields with \"extra_review_info\" field\n", + "comments = [{\"comments\": e[\"pull_request.comments\"],\n", + " \"review_comments\": e[\"pull_request.review_comments\"],\n", + " \"extra_review_info\": get_extra_review_info(e)} for e in all_events]\n", + "new_row = {\"pr_info\": pr_info, \"head_info\": head_info, \"base_info\": base_info, \"comments\": comments}" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'created_at'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[38], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m new_row \u001b[39m=\u001b[39m merge_events(row)\n", 
+ "Cell \u001b[0;32mIn[36], line 106\u001b[0m, in \u001b[0;36mmerge_events\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 102\u001b[0m issues \u001b[39m=\u001b[39m load_json(row[\u001b[39m\"\u001b[39m\u001b[39mpull_request.issue_events\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[1;32m 103\u001b[0m \u001b[39m# for each events in each category group all events sorted by \"created_at\" in one list\u001b[39;00m\n\u001b[1;32m 104\u001b[0m \u001b[39m# then merge all three lists\u001b[39;00m\n\u001b[1;32m 105\u001b[0m \u001b[39m# then sort by \"created_at\"\u001b[39;00m\n\u001b[0;32m--> 106\u001b[0m all_events \u001b[39m=\u001b[39m \u001b[39msorted\u001b[39;49m(\n\u001b[1;32m 107\u001b[0m events \u001b[39m+\u001b[39;49m reviews \u001b[39m+\u001b[39;49m issues,\n\u001b[1;32m 108\u001b[0m key\u001b[39m=\u001b[39;49m\u001b[39mlambda\u001b[39;49;00m x: x[\u001b[39m\"\u001b[39;49m\u001b[39mcreated_at\u001b[39;49m\u001b[39m\"\u001b[39;49m]\n\u001b[1;32m 109\u001b[0m )\n\u001b[1;32m 110\u001b[0m pr_info \u001b[39m=\u001b[39m {k: all_events[\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m][k] \u001b[39mfor\u001b[39;00m k \u001b[39min\u001b[39;00m pull_request_info_cols}\n\u001b[1;32m 111\u001b[0m head_info \u001b[39m=\u001b[39m {k: all_events[\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m][k] \u001b[39mfor\u001b[39;00m k \u001b[39min\u001b[39;00m head_info_cols}\n", + "Cell \u001b[0;32mIn[36], line 108\u001b[0m, in \u001b[0;36mmerge_events..\u001b[0;34m(x)\u001b[0m\n\u001b[1;32m 102\u001b[0m issues \u001b[39m=\u001b[39m load_json(row[\u001b[39m\"\u001b[39m\u001b[39mpull_request.issue_events\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[1;32m 103\u001b[0m \u001b[39m# for each events in each category group all events sorted by \"created_at\" in one list\u001b[39;00m\n\u001b[1;32m 104\u001b[0m \u001b[39m# then merge all three lists\u001b[39;00m\n\u001b[1;32m 105\u001b[0m \u001b[39m# then sort by \"created_at\"\u001b[39;00m\n\u001b[1;32m 106\u001b[0m all_events \u001b[39m=\u001b[39m 
\u001b[39msorted\u001b[39m(\n\u001b[1;32m 107\u001b[0m events \u001b[39m+\u001b[39m reviews \u001b[39m+\u001b[39m issues,\n\u001b[0;32m--> 108\u001b[0m key\u001b[39m=\u001b[39m\u001b[39mlambda\u001b[39;00m x: x[\u001b[39m\"\u001b[39;49m\u001b[39mcreated_at\u001b[39;49m\u001b[39m\"\u001b[39;49m]\n\u001b[1;32m 109\u001b[0m )\n\u001b[1;32m 110\u001b[0m pr_info \u001b[39m=\u001b[39m {k: all_events[\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m][k] \u001b[39mfor\u001b[39;00m k \u001b[39min\u001b[39;00m pull_request_info_cols}\n\u001b[1;32m 111\u001b[0m head_info \u001b[39m=\u001b[39m {k: all_events[\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m][k] \u001b[39mfor\u001b[39;00m k \u001b[39min\u001b[39;00m head_info_cols}\n", + "\u001b[0;31mKeyError\u001b[0m: 'created_at'" + ] + } + ], + "source": [ + "new_row = merge_events(row)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'repo': 'ACWI-SSWD/nldi_flowtools',\n", + " 'org': 'ACWI-SSWD',\n", + " 'issue_id': 929448726,\n", + " 'issue_number': 4,\n", + " 'pull_request': {'number': 4.0,\n", + " 'repo': 'nldi_flowtools',\n", + " 'user_login': 'ACWI-SSWD'},\n", + " 'events': [{'action': 'opened',\n", + " 'author': 'hillc-usgs',\n", + " 'comment': None,\n", + " 'comment_id': None,\n", + " 'datetime': '2021-06-24T17:23:03Z',\n", + " 'description': 'This PR makes nldi_flowtools able to work with the new pygeoapi restructure, and makes it installable directly into the new tool. The processors are now contained within the library for nldi_flowtools directly, which makes it far simpler to roll out the plugin without needing coding modifications to the USGS pygeoapi tool.',\n", + " 'title': 'pygeoapi_plugins refit',\n", + " 'type': 'issue'},\n", + " {'action': 'created',\n", + " 'author': 'rmcd-mscb',\n", + " 'comment': \"@Anders-Hopkins - I merged Cliff's changes to keep things moving but you might want to review the changes for yourself when you get back. 
\",\n", + " 'comment_id': 868826717.0,\n", + " 'datetime': '2021-06-25 20:51:35+00:00',\n", + " 'description': None,\n", + " 'title': None,\n", + " 'type': 'comment'}]}" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "issues" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# for each events in each category group all events sorted by \"created_at\" in one list\n", + "# then merge all three lists\n", + "# then sort by \"created_at\"\n", + "all_events = sorted(\n", + " events + reviews + issues,\n", + " key=lambda x: x[\"created_at\"]\n", + ")\n", + "pr_info = {k: all_events[-1][k] for k in pull_request_info_cols}\n", + "head_info = {k: all_events[-1][k] for k in head_info_cols}\n", + "base_info = {k: all_events[-1][k] for k in base_info_cols}\n", + "# each comment should have \"comments\" and \"review_comments\" fields with \"extra_review_info\" field\n", + "comments = [{\"comments\": e[\"pull_request.comments\"],\n", + " \"review_comments\": e[\"pull_request.review_comments\"],\n", + " \"extra_review_info\": get_extra_review_info(e)} for e in all_events]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pull_request_info_cols = [\n", + " \"repo.name\",\n", + " \"repo.id\",\n", + " \"org.id\",\n", + " \"public\",\n", + " \"pull_request.id\",\n", + " \"pull_request.guid\",\n", + " \"pull_request.number\",\n", + " \"pull_request.title\",\n", + " \"pull_request.body\",\n", + " \"pull_request.state\",\n", + " \"pull_request.user.login\",\n", + " \"pull_request.user.id\",\n", + " \"pull_request.created_at\",\n", + " \"pull_request.closed_at\",\n", + " \"pull_request.merged_at\",\n", + " \"pull_request.merged_by.login\",\n", + " \"pull_request.milestone.title\",\n", + " \"pull_request.milestone.description\",\n", + " \"pull_request.milestone.number\",\n", + " # 
commits\n", + " 'pull_request.commits',\n", + " 'pull_request.additions',\n", + " 'pull_request.deletions',\n", + " # changed files\n", + " 'pull_request.changed_files',\n", + "]\n", + "\n", + "comments = [\n", + " 'pull_request.comments',\n", + " 'pull_request.review_comments',\n", + " # for PR event\n", + " 'pull_request.label.name',\n", + " # review events only\n", + " 'review.state',\n", + " 'review.id', \n", + " 'review.body', \n", + " 'review.commit_id', \n", + " 'review.submitted_at', \n", + " 'review.author_association', '\n", + "]\n", + "\n", + "head_info_cols = [\n", + " \"pull_request.head.label\",\n", + " \"pull_request.head.ref\",\n", + " \"pull_request.head.user.login\",\n", + " \"pull_request.head.user.type\",\n", + " \"pull_request.head.repo.owner.login\",\n", + " \"pull_request.head.repo.owner.type\",\n", + " \"pull_request.head.repo.license.name\",\n", + " \"pull_request.head.sha\",\n", + " 'pull_request.head.repo.name',\n", + " 'pull_request.head.repo.owner.login',\n", + " 'pull_request.head.repo.homepage',\n", + " 'pull_request.head.repo.description',\n", + " 'pull_request.head.repo.language',\n", + " 'pull_request.head.repo.stargazers_count',\n", + " 'pull_request.head.repo.license.name',\n", + " 'pull_request.head.repo.default_branch',\n", + " 'pull_request.head.repo.private'\n", + "]\n", + "base_info_cols = [\n", + " \"pull_request.base.label\",\n", + " \"pull_request.base.ref\",\n", + " \"pull_request.base.sha\",\n", + " \"pull_request.base.user.login\",\n", + " \"pull_request.base.user.type\",\n", + " \"pull_request.base.repo.owner.login\",\n", + " \"pull_request.base.repo.owner.type\",\n", + " \"pull_request.base.repo.license.name\",\n", + " \"pull_request.base.repo.default_branch\",\n", + " \"pull_request.base.repo.description\",\n", + " \"pull_request.base.repo.language\",\n", + " \"pull_request.base.repo.watchers_count\",\n", + " \"pull_request.base.repo.open_issues_count\",\n", + " \"pull_request.base.repo.forks_count\",\n", + " 
'pull_request.base.repo.name',\n", + " 'pull_request.base.repo.owner.login',\n", + " 'pull_request.base.repo.homepage',\n", + " 'pull_request.base.repo.description',\n", + " 'pull_request.base.repo.language',\n", + " 'pull_request.base.repo.stargazers_count',\n", + " 'pull_request.base.repo.private',\n", + "]\n", + "# drop \"repo.name\", \"repo.id\", \"public\" so they are not duplicated and keep relevant columns that might change\n", + "event_cols = [\n", + " col\n", + " for col in df.columns\n", + " if (not col.startswith(\"pull_request.\"))\n", + " and col not in [\"repo.name\", \"repo.id\", \"public\"]\n", + "] + [\n", + " \"pull_request.head.label\",\n", + " \"pull_request.head.ref\",\n", + " \"pull_request.head.sha\",\n", + " \"pull_request.title\",\n", + "]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.11.0 ('.venv': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "0cc3054246fa39b40b564a97820c10836c9fb6acdf94e9196ea3a787cac26526" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From df0dc1b8ac081e3ca626bdf57bcb4954fafaaac9 Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Thu, 21 Sep 2023 13:57:45 +0200 Subject: [PATCH 5/5] update --- .../pull-requests/reconstruct_prs.ipynb | 4750 +---------------- 1 file changed, 28 insertions(+), 4722 deletions(-) diff --git a/data_analysis/pull-requests/reconstruct_prs.ipynb b/data_analysis/pull-requests/reconstruct_prs.ipynb index 9bc724c..db6a077 100644 --- a/data_analysis/pull-requests/reconstruct_prs.ipynb +++ b/data_analysis/pull-requests/reconstruct_prs.ipynb @@ -29,28 +29,6 @@ "pip install python-dateutil" ] }, - { - "cell_type": "code", - "execution_count": 
329, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading readme: 100%|██████████| 8.02k/8.02k [00:00<00:00, 1.52MB/s]\n" - ] - } - ], - "source": [ - "import json\n", - "import pandas as pd\n", - "from dateutil.parser import parse\n", - "from datasets import load_dataset, Dataset\n", - "\n", - "small_ds = load_dataset(\"bigcode-data/the-stack-gh-pull-requests\", use_auth_token=True, split=\"train\", streaming=True)" - ] - }, { "cell_type": "code", "execution_count": 330, @@ -69,17 +47,8 @@ "ds = small_ds.shuffle(seed=0, buffer_size=1_000_000)\n", "\n", "# 10k subset of random samples from ds\n", - "fianl_ds = list(ds.take(size))\n", - "ds = Dataset.from_pandas(pd.DataFrame(data=fianl_ds))" - ] - }, - { - "cell_type": "code", - "execution_count": 332, - "metadata": {}, - "outputs": [], - "source": [ - "ds = Dataset.from_pandas(pd.DataFrame(data=fianl_ds))" + "ds = list(ds.take(size))\n", + "ds = Dataset.from_pandas(pd.DataFrame(data=ds))" ] }, { @@ -107,353 +76,7 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import pandas as pd\n", - "from dateutil.parser import parse\n", - "from datasets import load_dataset, Dataset\n", - "\n", - "small_ds = load_dataset(\"bigcode-data/the-stack-gh-pull-requests\", use_auth_token=True, split=\"train\", streaming=True)\n", - "\n", - "size = 500_000\n", - "\n", - "ds = small_ds.shuffle(seed=0, buffer_size=1_000_000)\n", - "\n", - "# 10k subset of random samples from ds\n", - "fianl_ds = list(ds.take(size))\n", - "ds = Dataset.from_pandas(pd.DataFrame(data=fianl_ds))" - ] - }, - { - "cell_type": "code", - "execution_count": 335, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'__index_level_0__': 46164,\n", - " 'bucket': None,\n", - " 'pull_request.code_review_events': None,\n", - " 'pull_request.events': '[{\"type\": \"PullRequestEvent\", 
\"action\": \"opened\", '\n", - " '\"actor.login\": \"pull[bot]\", \"actor.id\": 39814207, '\n", - " '\"user.login\": null, \"user.id\": null, \"user.type\": '\n", - " 'null, \"repo.name\": \"kofj/website\", \"repo.id\": '\n", - " '158894695, \"public\": true, \"created_at\": '\n", - " '\"2020-11-23T05:58:40Z\", \"org.id\": null, \"org.login\": '\n", - " 'null, \"pull_request.id\": 525472638, '\n", - " '\"pull_request.number\": 2460, \"pull_request.state\": '\n", - " '\"open\", \"pull_request.title\": \"[pull] master from '\n", - " 'kubernetes:master\", \"pull_request.body\": \"See Commits '\n", - " 'and Changes for more details.\\\\n\\\\n-----\\\\nCreated by '\n", - " '[ '\n", - " '**pull[bot]**](https://github.com/wei/pull)\\\\n\\\\n_Can '\n", - " 'you help keep this open source service alive? '\n", - " '**[\\\\ud83d\\\\udc96 Please sponsor : '\n", - " ')](https://prod.download/pull-pr-sponsor)**_\", '\n", - " '\"pull_request.user.login\": \"pull[bot]\", '\n", - " '\"pull_request.user.id\": 39814207, '\n", - " '\"pull_request.author_association\": \"NONE\", '\n", - " '\"pull_request.created_at\": \"2020-11-23T05:58:39Z\", '\n", - " '\"pull_request.updated_at\": \"2020-11-23T05:58:39Z\", '\n", - " '\"pull_request.closed_at\": null, '\n", - " '\"pull_request.merged_at\": null, '\n", - " '\"pull_request.merge_commit_sha\": null, '\n", - " '\"pull_request.locked\": false, '\n", - " '\"pull_request.assignee.login\": null, '\n", - " '\"pull_request.assignee.id\": null, '\n", - " '\"pull_request.assignee.type\": null, '\n", - " '\"pull_request.assignee.site_admin\": null, '\n", - " '\"pull_request.milestone.id\": null, '\n", - " '\"pull_request.milestone.number\": null, '\n", - " '\"pull_request.milestone.title\": null, '\n", - " '\"pull_request.milestone.description\": null, '\n", - " '\"pull_request.milestone.creator.login\": null, '\n", - " '\"pull_request.milestone.creator.id\": null, '\n", - " '\"pull_request.milestone.creator.type\": null, '\n", - " 
'\"pull_request.milestone.creator.site_admin\": null, '\n", - " '\"pull_request.milestone.open_issues\": null, '\n", - " '\"pull_request.milestone.closed_issues\": null, '\n", - " '\"pull_request.milestone.state\": null, '\n", - " '\"pull_request.milestone.created_at\": null, '\n", - " '\"pull_request.milestone.updated_at\": null, '\n", - " '\"pull_request.milestone.due_on\": null, '\n", - " '\"pull_request.milestone.closed_at\": null, '\n", - " '\"pull_request.merged\": false, '\n", - " '\"pull_request.mergeable\": null, '\n", - " '\"pull_request.mergeable_state\": \"unknown\", '\n", - " '\"pull_request.merged_by.login\": null, '\n", - " '\"pull_request.merged_by.id\": null, '\n", - " '\"pull_request.merged_by.type\": null, '\n", - " '\"pull_request.merged_by.site_admin\": null, '\n", - " '\"pull_request.comments\": 0, '\n", - " '\"pull_request.review_comments\": 0, '\n", - " '\"pull_request.commits\": 4, \"pull_request.additions\": '\n", - " '243, \"pull_request.deletions\": 0, '\n", - " '\"pull_request.changed_files\": 2, '\n", - " '\"pull_request.label.id\": null, '\n", - " '\"pull_request.label.name\": null, '\n", - " '\"pull_request.label.color\": null, '\n", - " '\"pull_request.label.default\": null, '\n", - " '\"pull_request.head.label\": \"kubernetes:master\", '\n", - " '\"pull_request.head.ref\": \"master\", '\n", - " '\"pull_request.head.sha\": '\n", - " '\"fd65678baa464abe7715dbf6df44284506c858a2\", '\n", - " '\"pull_request.head.user.login\": \"kubernetes\", '\n", - " '\"pull_request.head.user.type\": \"Organization\", '\n", - " '\"pull_request.head.repo.name\": \"website\", '\n", - " '\"pull_request.head.repo.full_name\": '\n", - " '\"kubernetes/website\", '\n", - " '\"pull_request.head.repo.owner.login\": \"kubernetes\", '\n", - " '\"pull_request.head.repo.owner.type\": \"Organization\", '\n", - " '\"pull_request.head.repo.private\": false, '\n", - " '\"pull_request.head.repo.homepage\": '\n", - " '\"https://kubernetes.io\", '\n", - " 
'\"pull_request.head.repo.description\": \"Kubernetes '\n", - " 'website and documentation repo: \", '\n", - " '\"pull_request.head.repo.fork\": false, '\n", - " '\"pull_request.head.repo.created_at\": '\n", - " '\"2016-02-10T22:46:48Z\", '\n", - " '\"pull_request.head.repo.updated_at\": '\n", - " '\"2020-11-23T02:09:41Z\", '\n", - " '\"pull_request.head.repo.pushed_at\": '\n", - " '\"2020-11-23T05:12:37Z\", '\n", - " '\"pull_request.head.repo.size\": 319781, '\n", - " '\"pull_request.head.repo.stargazers_count\": 2267, '\n", - " '\"pull_request.head.repo.watchers_count\": 2267, '\n", - " '\"pull_request.head.repo.language\": \"HTML\", '\n", - " '\"pull_request.head.repo.has_issues\": true, '\n", - " '\"pull_request.head.repo.has_projects\": true, '\n", - " '\"pull_request.head.repo.has_downloads\": true, '\n", - " '\"pull_request.head.repo.has_wiki\": true, '\n", - " '\"pull_request.head.repo.has_pages\": false, '\n", - " '\"pull_request.head.repo.forks_count\": 8508, '\n", - " '\"pull_request.head.repo.archived\": false, '\n", - " '\"pull_request.head.repo.disabled\": false, '\n", - " '\"pull_request.head.repo.open_issues_count\": 641, '\n", - " '\"pull_request.head.repo.forks\": 8508, '\n", - " '\"pull_request.head.repo.open_issues\": 641, '\n", - " '\"pull_request.head.repo.watchers\": 2267, '\n", - " '\"pull_request.head.repo.default_branch\": \"master\", '\n", - " '\"pull_request.head.repo.license.key\": \"cc-by-4.0\", '\n", - " '\"pull_request.head.repo.license.spdx_id\": '\n", - " '\"CC-BY-4.0\", \"pull_request.head.repo.license.name\": '\n", - " '\"Creative Commons Attribution 4.0 International\", '\n", - " '\"pull_request.base.label\": \"kofj:master\", '\n", - " '\"pull_request.base.ref\": \"master\", '\n", - " '\"pull_request.base.sha\": '\n", - " '\"97a882c38db18684471447d06dd15c984302e0a7\", '\n", - " '\"pull_request.base.user.login\": \"kofj\", '\n", - " '\"pull_request.base.user.type\": \"User\", '\n", - " '\"pull_request.base.repo.name\": 
\"website\", '\n", - " '\"pull_request.base.repo.full_name\": \"kofj/website\", '\n", - " '\"pull_request.base.repo.owner.login\": \"kofj\", '\n", - " '\"pull_request.base.repo.owner.type\": \"User\", '\n", - " '\"pull_request.base.repo.private\": false, '\n", - " '\"pull_request.base.repo.homepage\": '\n", - " '\"https://kubernetes.io\", '\n", - " '\"pull_request.base.repo.description\": \"Kubernetes '\n", - " 'website and documentation repo: \", '\n", - " '\"pull_request.base.repo.fork\": true, '\n", - " '\"pull_request.base.repo.created_at\": '\n", - " '\"2018-11-24T02:12:25Z\", '\n", - " '\"pull_request.base.repo.updated_at\": '\n", - " '\"2020-11-23T01:58:46Z\", '\n", - " '\"pull_request.base.repo.pushed_at\": '\n", - " '\"2020-11-23T01:58:43Z\", '\n", - " '\"pull_request.base.repo.size\": 286251, '\n", - " '\"pull_request.base.repo.stargazers_count\": 0, '\n", - " '\"pull_request.base.repo.watchers_count\": 0, '\n", - " '\"pull_request.base.repo.language\": \"HTML\", '\n", - " '\"pull_request.base.repo.has_issues\": false, '\n", - " '\"pull_request.base.repo.has_projects\": true, '\n", - " '\"pull_request.base.repo.has_downloads\": true, '\n", - " '\"pull_request.base.repo.has_wiki\": true, '\n", - " '\"pull_request.base.repo.has_pages\": false, '\n", - " '\"pull_request.base.repo.forks_count\": 0, '\n", - " '\"pull_request.base.repo.archived\": false, '\n", - " '\"pull_request.base.repo.disabled\": false, '\n", - " '\"pull_request.base.repo.open_issues_count\": 1, '\n", - " '\"pull_request.base.repo.forks\": 0, '\n", - " '\"pull_request.base.repo.open_issues\": 1, '\n", - " '\"pull_request.base.repo.watchers\": 0, '\n", - " '\"pull_request.base.repo.default_branch\": \"master\", '\n", - " '\"pull_request.base.repo.license.key\": \"cc-by-4.0\", '\n", - " '\"pull_request.base.repo.license.spdx_id\": '\n", - " '\"CC-BY-4.0\", \"pull_request.base.repo.license.name\": '\n", - " '\"Creative Commons Attribution 4.0 International\", '\n", - " '\"pull_request.guid\": 
\"kofj/website/pull/2460\"}, '\n", - " '{\"type\": \"PullRequestEvent\", \"action\": \"closed\", '\n", - " '\"actor.login\": \"pull[bot]\", \"actor.id\": 39814207, '\n", - " '\"user.login\": null, \"user.id\": null, \"user.type\": '\n", - " 'null, \"repo.name\": \"kofj/website\", \"repo.id\": '\n", - " '158894695, \"public\": true, \"created_at\": '\n", - " '\"2020-11-23T05:58:50Z\", \"org.id\": null, \"org.login\": '\n", - " 'null, \"pull_request.id\": 525472638, '\n", - " '\"pull_request.number\": 2460, \"pull_request.state\": '\n", - " '\"closed\", \"pull_request.title\": \"[pull] master from '\n", - " 'kubernetes:master\", \"pull_request.body\": \"See '\n", - " '[Commits](/kofj/website/pull/2460/commits) and '\n", - " '[Changes](/kofj/website/pull/2460/files) for more '\n", - " 'details.\\\\n\\\\n-----\\\\nCreated by [ '\n", - " '**pull[bot]**](https://github.com/wei/pull)\\\\n\\\\n_Can '\n", - " 'you help keep this open source service alive? '\n", - " '**[\\\\ud83d\\\\udc96 Please sponsor : '\n", - " ')](https://prod.download/pull-pr-sponsor)**_\", '\n", - " '\"pull_request.user.login\": \"pull[bot]\", '\n", - " '\"pull_request.user.id\": 39814207, '\n", - " '\"pull_request.author_association\": \"NONE\", '\n", - " '\"pull_request.created_at\": \"2020-11-23T05:58:39Z\", '\n", - " '\"pull_request.updated_at\": \"2020-11-23T05:58:50Z\", '\n", - " '\"pull_request.closed_at\": \"2020-11-23T05:58:50Z\", '\n", - " '\"pull_request.merged_at\": \"2020-11-23T05:58:49Z\", '\n", - " '\"pull_request.merge_commit_sha\": '\n", - " '\"fd65678baa464abe7715dbf6df44284506c858a2\", '\n", - " '\"pull_request.locked\": false, '\n", - " '\"pull_request.assignee.login\": null, '\n", - " '\"pull_request.assignee.id\": null, '\n", - " '\"pull_request.assignee.type\": null, '\n", - " '\"pull_request.assignee.site_admin\": null, '\n", - " '\"pull_request.milestone.id\": null, '\n", - " '\"pull_request.milestone.number\": null, '\n", - " '\"pull_request.milestone.title\": null, '\n", - " 
'\"pull_request.milestone.description\": null, '\n", - " '\"pull_request.milestone.creator.login\": null, '\n", - " '\"pull_request.milestone.creator.id\": null, '\n", - " '\"pull_request.milestone.creator.type\": null, '\n", - " '\"pull_request.milestone.creator.site_admin\": null, '\n", - " '\"pull_request.milestone.open_issues\": null, '\n", - " '\"pull_request.milestone.closed_issues\": null, '\n", - " '\"pull_request.milestone.state\": null, '\n", - " '\"pull_request.milestone.created_at\": null, '\n", - " '\"pull_request.milestone.updated_at\": null, '\n", - " '\"pull_request.milestone.due_on\": null, '\n", - " '\"pull_request.milestone.closed_at\": null, '\n", - " '\"pull_request.merged\": true, '\n", - " '\"pull_request.mergeable\": null, '\n", - " '\"pull_request.mergeable_state\": \"unknown\", '\n", - " '\"pull_request.merged_by.login\": \"pull[bot]\", '\n", - " '\"pull_request.merged_by.id\": 39814207, '\n", - " '\"pull_request.merged_by.type\": \"Bot\", '\n", - " '\"pull_request.merged_by.site_admin\": false, '\n", - " '\"pull_request.comments\": 0, '\n", - " '\"pull_request.review_comments\": 0, '\n", - " '\"pull_request.commits\": 4, \"pull_request.additions\": '\n", - " '243, \"pull_request.deletions\": 0, '\n", - " '\"pull_request.changed_files\": 2, '\n", - " '\"pull_request.label.id\": null, '\n", - " '\"pull_request.label.name\": null, '\n", - " '\"pull_request.label.color\": null, '\n", - " '\"pull_request.label.default\": null, '\n", - " '\"pull_request.head.label\": \"kubernetes:master\", '\n", - " '\"pull_request.head.ref\": \"master\", '\n", - " '\"pull_request.head.sha\": '\n", - " '\"fd65678baa464abe7715dbf6df44284506c858a2\", '\n", - " '\"pull_request.head.user.login\": \"kubernetes\", '\n", - " '\"pull_request.head.user.type\": \"Organization\", '\n", - " '\"pull_request.head.repo.name\": \"website\", '\n", - " '\"pull_request.head.repo.full_name\": '\n", - " '\"kubernetes/website\", '\n", - " '\"pull_request.head.repo.owner.login\": 
\"kubernetes\", '\n", - " '\"pull_request.head.repo.owner.type\": \"Organization\", '\n", - " '\"pull_request.head.repo.private\": false, '\n", - " '\"pull_request.head.repo.homepage\": '\n", - " '\"https://kubernetes.io\", '\n", - " '\"pull_request.head.repo.description\": \"Kubernetes '\n", - " 'website and documentation repo: \", '\n", - " '\"pull_request.head.repo.fork\": false, '\n", - " '\"pull_request.head.repo.created_at\": '\n", - " '\"2016-02-10T22:46:48Z\", '\n", - " '\"pull_request.head.repo.updated_at\": '\n", - " '\"2020-11-23T02:09:41Z\", '\n", - " '\"pull_request.head.repo.pushed_at\": '\n", - " '\"2020-11-23T05:12:37Z\", '\n", - " '\"pull_request.head.repo.size\": 319781, '\n", - " '\"pull_request.head.repo.stargazers_count\": 2267, '\n", - " '\"pull_request.head.repo.watchers_count\": 2267, '\n", - " '\"pull_request.head.repo.language\": \"HTML\", '\n", - " '\"pull_request.head.repo.has_issues\": true, '\n", - " '\"pull_request.head.repo.has_projects\": true, '\n", - " '\"pull_request.head.repo.has_downloads\": true, '\n", - " '\"pull_request.head.repo.has_wiki\": true, '\n", - " '\"pull_request.head.repo.has_pages\": false, '\n", - " '\"pull_request.head.repo.forks_count\": 8508, '\n", - " '\"pull_request.head.repo.archived\": false, '\n", - " '\"pull_request.head.repo.disabled\": false, '\n", - " '\"pull_request.head.repo.open_issues_count\": 641, '\n", - " '\"pull_request.head.repo.forks\": 8508, '\n", - " '\"pull_request.head.repo.open_issues\": 641, '\n", - " '\"pull_request.head.repo.watchers\": 2267, '\n", - " '\"pull_request.head.repo.default_branch\": \"master\", '\n", - " '\"pull_request.head.repo.license.key\": \"cc-by-4.0\", '\n", - " '\"pull_request.head.repo.license.spdx_id\": '\n", - " '\"CC-BY-4.0\", \"pull_request.head.repo.license.name\": '\n", - " '\"Creative Commons Attribution 4.0 International\", '\n", - " '\"pull_request.base.label\": \"kofj:master\", '\n", - " '\"pull_request.base.ref\": \"master\", '\n", - " 
'\"pull_request.base.sha\": '\n", - " '\"97a882c38db18684471447d06dd15c984302e0a7\", '\n", - " '\"pull_request.base.user.login\": \"kofj\", '\n", - " '\"pull_request.base.user.type\": \"User\", '\n", - " '\"pull_request.base.repo.name\": \"website\", '\n", - " '\"pull_request.base.repo.full_name\": \"kofj/website\", '\n", - " '\"pull_request.base.repo.owner.login\": \"kofj\", '\n", - " '\"pull_request.base.repo.owner.type\": \"User\", '\n", - " '\"pull_request.base.repo.private\": false, '\n", - " '\"pull_request.base.repo.homepage\": '\n", - " '\"https://kubernetes.io\", '\n", - " '\"pull_request.base.repo.description\": \"Kubernetes '\n", - " 'website and documentation repo: \", '\n", - " '\"pull_request.base.repo.fork\": true, '\n", - " '\"pull_request.base.repo.created_at\": '\n", - " '\"2018-11-24T02:12:25Z\", '\n", - " '\"pull_request.base.repo.updated_at\": '\n", - " '\"2020-11-23T01:58:46Z\", '\n", - " '\"pull_request.base.repo.pushed_at\": '\n", - " '\"2020-11-23T05:58:46Z\", '\n", - " '\"pull_request.base.repo.size\": 286251, '\n", - " '\"pull_request.base.repo.stargazers_count\": 0, '\n", - " '\"pull_request.base.repo.watchers_count\": 0, '\n", - " '\"pull_request.base.repo.language\": \"HTML\", '\n", - " '\"pull_request.base.repo.has_issues\": false, '\n", - " '\"pull_request.base.repo.has_projects\": true, '\n", - " '\"pull_request.base.repo.has_downloads\": true, '\n", - " '\"pull_request.base.repo.has_wiki\": true, '\n", - " '\"pull_request.base.repo.has_pages\": false, '\n", - " '\"pull_request.base.repo.forks_count\": 0, '\n", - " '\"pull_request.base.repo.archived\": false, '\n", - " '\"pull_request.base.repo.disabled\": false, '\n", - " '\"pull_request.base.repo.open_issues_count\": 0, '\n", - " '\"pull_request.base.repo.forks\": 0, '\n", - " '\"pull_request.base.repo.open_issues\": 0, '\n", - " '\"pull_request.base.repo.watchers\": 0, '\n", - " '\"pull_request.base.repo.default_branch\": \"master\", '\n", - " 
'\"pull_request.base.repo.license.key\": \"cc-by-4.0\", '\n", - " '\"pull_request.base.repo.license.spdx_id\": '\n", - " '\"CC-BY-4.0\", \"pull_request.base.repo.license.name\": '\n", - " '\"Creative Commons Attribution 4.0 International\", '\n", - " '\"pull_request.guid\": \"kofj/website/pull/2460\"}]',\n", - " 'pull_request.guid': 'kofj/website/pull/2460',\n", - " 'pull_request.issue_events': None}\n" - ] - } - ], - "source": [ - "from pprint import pprint\n", - "\n", - "pprint(ds[0])" - ] - }, - { - "cell_type": "code", - "execution_count": 412, + "execution_count": 444, "metadata": {}, "outputs": [], "source": [ @@ -625,16 +248,15 @@ " try:\n", " base_data = events[0] if events else reviews[0]\n", " except IndexError:\n", + " # init empty dict\n", + " base_data = {}\n", " if issues:\n", - " base_data = issues_events[0]\n", - " first_event = issues[0][\"events\"][0]\n", - " base_data['pull_request.title'] = first_event[\"title\"]\n", - " print(f'base data keys: {base_data.keys()}')\n", - " base_data[\"repo.name\"] = base_data[\"repo\"]\n", - " base_data[\"org.id\"] = base_data[\"org\"]\n", - " base_data[\"repo.name\"] = base_data[\"repo\"]\n", - " base_data[\"pull_request.number\"] = int(base_data[\"pull_request\"][\"number\"])\n", - " base_data[\"pull_request.user.login\"] = base_data[\"pull_request\"][\"user_login\"]\n", + " base_data = {}\n", + " first_event = issues[0]\n", + " base_data['pull_request.title'] = first_event[\"events\"][0][\"title\"]\n", + " base_data[\"repo.name\"] = first_event[\"repo\"]\n", + " base_data[\"pull_request.number\"] = first_event[\"pull_request\"][\"number\"]\n", + " base_data[\"pull_request.user.login\"] = first_event[\"pull_request\"][\"user_login\"]\n", " print(\"filling PR data from issue event\")\n", " else:\n", " raise IndexError(\"No events for PR\")\n", @@ -660,4356 +282,40 @@ }, { "cell_type": "code", - "execution_count": 413, - "metadata": {}, - "outputs": [ - { - "ename": "KeyError", - "evalue": "'repo'", - 
"output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[412], line 167\u001b[0m, in \u001b[0;36mmerge_events\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 166\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 167\u001b[0m base_data \u001b[39m=\u001b[39m events[\u001b[39m0\u001b[39m] \u001b[39mif\u001b[39;00m events \u001b[39melse\u001b[39;00m reviews[\u001b[39m0\u001b[39;49m]\n\u001b[1;32m 168\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mIndexError\u001b[39;00m:\n", - "\u001b[0;31mIndexError\u001b[0m: list index out of range", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[413], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m new_row \u001b[39m=\u001b[39m merge_events(row)\n", - "Cell \u001b[0;32mIn[412], line 173\u001b[0m, in \u001b[0;36mmerge_events\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 171\u001b[0m first_event \u001b[39m=\u001b[39m issues[\u001b[39m0\u001b[39m][\u001b[39m\"\u001b[39m\u001b[39mevents\u001b[39m\u001b[39m\"\u001b[39m][\u001b[39m0\u001b[39m]\n\u001b[1;32m 172\u001b[0m base_data[\u001b[39m'\u001b[39m\u001b[39mpull_request.title\u001b[39m\u001b[39m'\u001b[39m] \u001b[39m=\u001b[39m first_event[\u001b[39m\"\u001b[39m\u001b[39mtitle\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[0;32m--> 173\u001b[0m base_data[\u001b[39m\"\u001b[39m\u001b[39mrepo.name\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m base_data[\u001b[39m\"\u001b[39;49m\u001b[39mrepo\u001b[39;49m\u001b[39m\"\u001b[39;49m]\n\u001b[1;32m 174\u001b[0m base_data[\u001b[39m\"\u001b[39m\u001b[39morg.id\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m base_data[\u001b[39m\"\u001b[39m\u001b[39morg\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[1;32m 175\u001b[0m 
base_data[\u001b[39m\"\u001b[39m\u001b[39mrepo.name\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m base_data[\u001b[39m\"\u001b[39m\u001b[39mrepo\u001b[39m\u001b[39m\"\u001b[39m]\n", - "\u001b[0;31mKeyError\u001b[0m: 'repo'" - ] - } - ], - "source": [ - "new_row = merge_events(row)" - ] - }, - { - "cell_type": "code", - "execution_count": 411, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - }, - { - "ename": "KeyError", - "evalue": "'repo'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[410], line 167\u001b[0m, in \u001b[0;36mmerge_events\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 166\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 167\u001b[0m base_data \u001b[39m=\u001b[39m events[\u001b[39m0\u001b[39m] \u001b[39mif\u001b[39;00m events \u001b[39melse\u001b[39;00m reviews[\u001b[39m0\u001b[39;49m]\n\u001b[1;32m 168\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mIndexError\u001b[39;00m:\n", - "\u001b[0;31mIndexError\u001b[0m: list index out of range", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[411], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m small_ds_2 \u001b[39m=\u001b[39m ds\u001b[39m.\u001b[39mselect(\u001b[39mrange\u001b[39m(\u001b[39m1000\u001b[39m))\n\u001b[0;32m----> 2\u001b[0m dd \u001b[39m=\u001b[39m small_ds_2\u001b[39m.\u001b[39;49mmap(merge_events)\n", - "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:580\u001b[0m, in \u001b[0;36mtransmit_tasks..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 578\u001b[0m \u001b[39mself\u001b[39m: \u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m 
\u001b[39m=\u001b[39m kwargs\u001b[39m.\u001b[39mpop(\u001b[39m\"\u001b[39m\u001b[39mself\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 579\u001b[0m \u001b[39m# apply actual function\u001b[39;00m\n\u001b[0;32m--> 580\u001b[0m out: Union[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mDatasetDict\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m func(\u001b[39mself\u001b[39;49m, \u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 581\u001b[0m datasets: List[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(out\u001b[39m.\u001b[39mvalues()) \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(out, \u001b[39mdict\u001b[39m) \u001b[39melse\u001b[39;00m [out]\n\u001b[1;32m 582\u001b[0m \u001b[39mfor\u001b[39;00m dataset \u001b[39min\u001b[39;00m datasets:\n\u001b[1;32m 583\u001b[0m \u001b[39m# Remove task templates if a column mapping of the template is no longer valid\u001b[39;00m\n", - "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:545\u001b[0m, in \u001b[0;36mtransmit_format..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 538\u001b[0m self_format \u001b[39m=\u001b[39m {\n\u001b[1;32m 539\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mtype\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_format_type,\n\u001b[1;32m 540\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mformat_kwargs\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_format_kwargs,\n\u001b[1;32m 541\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mcolumns\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_format_columns,\n\u001b[1;32m 542\u001b[0m \u001b[39m\"\u001b[39m\u001b[39moutput_all_columns\u001b[39m\u001b[39m\"\u001b[39m: 
\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_output_all_columns,\n\u001b[1;32m 543\u001b[0m }\n\u001b[1;32m 544\u001b[0m \u001b[39m# apply actual function\u001b[39;00m\n\u001b[0;32m--> 545\u001b[0m out: Union[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mDatasetDict\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m func(\u001b[39mself\u001b[39;49m, \u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 546\u001b[0m datasets: List[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(out\u001b[39m.\u001b[39mvalues()) \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(out, \u001b[39mdict\u001b[39m) \u001b[39melse\u001b[39;00m [out]\n\u001b[1;32m 547\u001b[0m \u001b[39m# re-apply format to the output\u001b[39;00m\n", - "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:3087\u001b[0m, in \u001b[0;36mDataset.map\u001b[0;34m(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)\u001b[0m\n\u001b[1;32m 3079\u001b[0m \u001b[39mif\u001b[39;00m transformed_dataset \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 3080\u001b[0m \u001b[39mwith\u001b[39;00m logging\u001b[39m.\u001b[39mtqdm(\n\u001b[1;32m 3081\u001b[0m disable\u001b[39m=\u001b[39m\u001b[39mnot\u001b[39;00m logging\u001b[39m.\u001b[39mis_progress_bar_enabled(),\n\u001b[1;32m 3082\u001b[0m unit\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m examples\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 3085\u001b[0m desc\u001b[39m=\u001b[39mdesc \u001b[39mor\u001b[39;00m 
\u001b[39m\"\u001b[39m\u001b[39mMap\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 3086\u001b[0m ) \u001b[39mas\u001b[39;00m pbar:\n\u001b[0;32m-> 3087\u001b[0m \u001b[39mfor\u001b[39;00m rank, done, content \u001b[39min\u001b[39;00m Dataset\u001b[39m.\u001b[39m_map_single(\u001b[39m*\u001b[39m\u001b[39m*\u001b[39mdataset_kwargs):\n\u001b[1;32m 3088\u001b[0m \u001b[39mif\u001b[39;00m done:\n\u001b[1;32m 3089\u001b[0m shards_done \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m1\u001b[39m\n", - "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:3441\u001b[0m, in \u001b[0;36mDataset._map_single\u001b[0;34m(shard, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, new_fingerprint, rank, offset)\u001b[0m\n\u001b[1;32m 3439\u001b[0m _time \u001b[39m=\u001b[39m time\u001b[39m.\u001b[39mtime()\n\u001b[1;32m 3440\u001b[0m \u001b[39mfor\u001b[39;00m i, example \u001b[39min\u001b[39;00m shard_iterable:\n\u001b[0;32m-> 3441\u001b[0m example \u001b[39m=\u001b[39m apply_function_on_filtered_inputs(example, i, offset\u001b[39m=\u001b[39;49moffset)\n\u001b[1;32m 3442\u001b[0m \u001b[39mif\u001b[39;00m update_data:\n\u001b[1;32m 3443\u001b[0m \u001b[39mif\u001b[39;00m i \u001b[39m==\u001b[39m \u001b[39m0\u001b[39m:\n", - "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:3344\u001b[0m, in \u001b[0;36mDataset._map_single..apply_function_on_filtered_inputs\u001b[0;34m(pa_inputs, indices, check_same_num_examples, offset)\u001b[0m\n\u001b[1;32m 3342\u001b[0m \u001b[39mif\u001b[39;00m with_rank:\n\u001b[1;32m 3343\u001b[0m additional_args \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m (rank,)\n\u001b[0;32m-> 3344\u001b[0m processed_inputs \u001b[39m=\u001b[39m function(\u001b[39m*\u001b[39;49mfn_args, \u001b[39m*\u001b[39;49madditional_args, 
\u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mfn_kwargs)\n\u001b[1;32m 3345\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(processed_inputs, LazyDict):\n\u001b[1;32m 3346\u001b[0m processed_inputs \u001b[39m=\u001b[39m {\n\u001b[1;32m 3347\u001b[0m k: v \u001b[39mfor\u001b[39;00m k, v \u001b[39min\u001b[39;00m processed_inputs\u001b[39m.\u001b[39mdata\u001b[39m.\u001b[39mitems() \u001b[39mif\u001b[39;00m k \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m processed_inputs\u001b[39m.\u001b[39mkeys_to_format\n\u001b[1;32m 3348\u001b[0m }\n", - "Cell \u001b[0;32mIn[410], line 173\u001b[0m, in \u001b[0;36mmerge_events\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 171\u001b[0m first_event \u001b[39m=\u001b[39m issues[\u001b[39m0\u001b[39m][\u001b[39m\"\u001b[39m\u001b[39mevents\u001b[39m\u001b[39m\"\u001b[39m][\u001b[39m0\u001b[39m]\n\u001b[1;32m 172\u001b[0m base_data[\u001b[39m'\u001b[39m\u001b[39mpull_request.title\u001b[39m\u001b[39m'\u001b[39m] \u001b[39m=\u001b[39m first_event[\u001b[39m\"\u001b[39m\u001b[39mtitle\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[0;32m--> 173\u001b[0m base_data[\u001b[39m\"\u001b[39m\u001b[39mrepo.name\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m base_data[\u001b[39m\"\u001b[39;49m\u001b[39mrepo\u001b[39;49m\u001b[39m\"\u001b[39;49m]\n\u001b[1;32m 174\u001b[0m base_data[\u001b[39m\"\u001b[39m\u001b[39morg.id\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m base_data[\u001b[39m\"\u001b[39m\u001b[39morg\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[1;32m 175\u001b[0m base_data[\u001b[39m\"\u001b[39m\u001b[39mrepo.name\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m base_data[\u001b[39m\"\u001b[39m\u001b[39mrepo\u001b[39m\u001b[39m\"\u001b[39m]\n", - "\u001b[0;31mKeyError\u001b[0m: 'repo'" - ] - } - ], - "source": [ - "small_ds_2 = ds.select(range(1000))\n", - "dd = small_ds_2.map(merge_events)" - ] - }, - { - "cell_type": "code", - "execution_count": 405, - "metadata": {}, - "outputs": [ - { - "data": { - 
"text/plain": [ - "dict_keys(['action', 'author', 'comment', 'comment_id', 'description', 'title', 'type', 'created_at'])" - ] - }, - "execution_count": 405, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "issues_events[0].keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 366, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'pull_request.guid': 'karen-kua/COVID-19_Tracker/pull/15',\n", - " 'pull_request.code_review_events': None,\n", - " 'pull_request.events': None,\n", - " 'pull_request.issue_events': '{\"repo\": \"karen-kua/COVID-19_Tracker\", \"org\": null, \"issue_id\": 1018615993, \"issue_number\": 15, \"pull_request\": {\"number\": 15.0, \"repo\": \"COVID-19_Tracker\", \"user_login\": \"karen-kua\"}, \"events\": [{\"action\": \"opened\", \"author\": \"dependabot[bot]\", \"comment\": null, \"comment_id\": null, \"datetime\": \"2021-10-06T15:46:43Z\", \"description\": \"Bumps [url-parse](https://github.com/unshiftio/url-parse) from 1.4.7 to 1.5.3.\\\\n
\\\\nCommits\\\\n
    \\\\n
  • ad44493 [dist] 1.5.3
  • \\\\n
  • c798461 [fix] Fix host parsing for file URLs (#210)
  • \\\\n
  • 201034b [dist] 1.5.2
  • \\\\n
  • 2d9ac2c [fix] Sanitize only special URLs (#209)
  • \\\\n
  • fb128af [fix] Use \\'null\\' as origin for non special URLs
  • \\\\n
  • fed6d9e [fix] Add a leading slash only if the URL is special
  • \\\\n
  • 94872e7 [fix] Do not incorrectly set the slashes property to true
  • \\\\n
  • 81ab967 [fix] Ignore slashes after the protocol for special URLs
  • \\\\n
  • ee22050 [ci] Use GitHub Actions
  • \\\\n
  • d2979b5 [fix] Special case the file: protocol (#204)
  • \\\\n
  • Additional commits viewable in compare view
  • \\\\n
\\\\n
\\\\n
\\\\n\\\\n\\\\n[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=url-parse&package-manager=npm_and_yarn&previous-version=1.4.7&new-version=1.5.3)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)\\\\n\\\\nDependabot will resolve any conflicts with this PR as long as you don\\'t alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.\\\\n\\\\n[//]: # (dependabot-automerge-start)\\\\n[//]: # (dependabot-automerge-end)\\\\n\\\\n---\\\\n\\\\n
\\\\nDependabot commands and options\\\\n
\\\\n\\\\nYou can trigger Dependabot actions by commenting on this PR:\\\\n- `@dependabot rebase` will rebase this PR\\\\n- `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it\\\\n- `@dependabot merge` will merge this PR after your CI passes on it\\\\n- `@dependabot squash and merge` will squash and merge this PR after your CI passes on it\\\\n- `@dependabot cancel merge` will cancel a previously requested merge and block automerging\\\\n- `@dependabot reopen` will reopen this PR if it is closed\\\\n- `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually\\\\n- `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)\\\\n- `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)\\\\n- `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)\\\\n- `@dependabot use these labels` will set the current labels as the default for future PRs for this repo and language\\\\n- `@dependabot use these reviewers` will set the current reviewers as the default for future PRs for this repo and language\\\\n- `@dependabot use these assignees` will set the current assignees as the default for future PRs for this repo and language\\\\n- `@dependabot use this milestone` will set the current milestone as the default for future PRs for this repo and language\\\\n\\\\nYou can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/azukimochi/COVID-19_Tracker/network/alerts).\\\\n\\\\n
\", \"title\": \"Bump url-parse from 1.4.7 to 1.5.3\", \"type\": \"issue\"}, {\"action\": \"created\", \"author\": \"dependabot[bot]\", \"comment\": \"Superseded by #17.\", \"comment_id\": 1045459471.0, \"datetime\": \"2022-02-19 00:53:17+00:00\", \"description\": null, \"title\": null, \"type\": \"comment\"}]}',\n", - " 'bucket': '940',\n", - " '__index_level_0__': 72946}" - ] - }, - "execution_count": 366, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "row" - ] - }, - { - "cell_type": "code", - "execution_count": 360, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['repo', 'org', 'issue_id', 'issue_number', 'pull_request', 'events'])" - ] - }, - "execution_count": 360, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "issues[0].keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 361, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'number': 15.0, 'repo': 'COVID-19_Tracker', 'user_login': 'karen-kua'}" - ] - }, - "execution_count": 361, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "issues[0][\"pull_request\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 351, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "small_ds_2 = ds.select(range(500))" - ] - }, - { - "cell_type": "code", - "execution_count": 398, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - }, - { - "ename": "KeyError", - "evalue": "'events'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[396], line 167\u001b[0m, in \u001b[0;36mmerge_events\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 166\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 167\u001b[0m base_data 
\u001b[39m=\u001b[39m events[\u001b[39m0\u001b[39m] \u001b[39mif\u001b[39;00m events \u001b[39melse\u001b[39;00m reviews[\u001b[39m0\u001b[39;49m]\n\u001b[1;32m 168\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mIndexError\u001b[39;00m:\n", - "\u001b[0;31mIndexError\u001b[0m: list index out of range", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[398], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m merged_ds \u001b[39m=\u001b[39m small_ds_2\u001b[39m.\u001b[39;49mmap(merge_events, remove_columns\u001b[39m=\u001b[39;49m[\u001b[39m\"\u001b[39;49m\u001b[39mpull_request.events\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m\"\u001b[39;49m\u001b[39mpull_request.code_review_events\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m\"\u001b[39;49m\u001b[39mpull_request.issue_events\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m'\u001b[39;49m\u001b[39m__index_level_0__\u001b[39;49m\u001b[39m'\u001b[39;49m,\u001b[39m'\u001b[39;49m\u001b[39mpull_request.guid\u001b[39;49m\u001b[39m'\u001b[39;49m])\n", - "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:580\u001b[0m, in \u001b[0;36mtransmit_tasks..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 578\u001b[0m \u001b[39mself\u001b[39m: \u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m \u001b[39m=\u001b[39m kwargs\u001b[39m.\u001b[39mpop(\u001b[39m\"\u001b[39m\u001b[39mself\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 579\u001b[0m \u001b[39m# apply actual function\u001b[39;00m\n\u001b[0;32m--> 580\u001b[0m out: Union[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mDatasetDict\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m func(\u001b[39mself\u001b[39;49m, \u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 
581\u001b[0m datasets: List[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(out\u001b[39m.\u001b[39mvalues()) \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(out, \u001b[39mdict\u001b[39m) \u001b[39melse\u001b[39;00m [out]\n\u001b[1;32m 582\u001b[0m \u001b[39mfor\u001b[39;00m dataset \u001b[39min\u001b[39;00m datasets:\n\u001b[1;32m 583\u001b[0m \u001b[39m# Remove task templates if a column mapping of the template is no longer valid\u001b[39;00m\n", - "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:545\u001b[0m, in \u001b[0;36mtransmit_format..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 538\u001b[0m self_format \u001b[39m=\u001b[39m {\n\u001b[1;32m 539\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mtype\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_format_type,\n\u001b[1;32m 540\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mformat_kwargs\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_format_kwargs,\n\u001b[1;32m 541\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mcolumns\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_format_columns,\n\u001b[1;32m 542\u001b[0m \u001b[39m\"\u001b[39m\u001b[39moutput_all_columns\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_output_all_columns,\n\u001b[1;32m 543\u001b[0m }\n\u001b[1;32m 544\u001b[0m \u001b[39m# apply actual function\u001b[39;00m\n\u001b[0;32m--> 545\u001b[0m out: Union[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mDatasetDict\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m func(\u001b[39mself\u001b[39;49m, \u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 546\u001b[0m datasets: 
List[\u001b[39m\"\u001b[39m\u001b[39mDataset\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mlist\u001b[39m(out\u001b[39m.\u001b[39mvalues()) \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(out, \u001b[39mdict\u001b[39m) \u001b[39melse\u001b[39;00m [out]\n\u001b[1;32m 547\u001b[0m \u001b[39m# re-apply format to the output\u001b[39;00m\n", - "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:3087\u001b[0m, in \u001b[0;36mDataset.map\u001b[0;34m(self, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, num_proc, suffix_template, new_fingerprint, desc)\u001b[0m\n\u001b[1;32m 3079\u001b[0m \u001b[39mif\u001b[39;00m transformed_dataset \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 3080\u001b[0m \u001b[39mwith\u001b[39;00m logging\u001b[39m.\u001b[39mtqdm(\n\u001b[1;32m 3081\u001b[0m disable\u001b[39m=\u001b[39m\u001b[39mnot\u001b[39;00m logging\u001b[39m.\u001b[39mis_progress_bar_enabled(),\n\u001b[1;32m 3082\u001b[0m unit\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m examples\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 3085\u001b[0m desc\u001b[39m=\u001b[39mdesc \u001b[39mor\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mMap\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 3086\u001b[0m ) \u001b[39mas\u001b[39;00m pbar:\n\u001b[0;32m-> 3087\u001b[0m \u001b[39mfor\u001b[39;00m rank, done, content \u001b[39min\u001b[39;00m Dataset\u001b[39m.\u001b[39m_map_single(\u001b[39m*\u001b[39m\u001b[39m*\u001b[39mdataset_kwargs):\n\u001b[1;32m 3088\u001b[0m \u001b[39mif\u001b[39;00m done:\n\u001b[1;32m 3089\u001b[0m shards_done \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m1\u001b[39m\n", - "File 
\u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:3441\u001b[0m, in \u001b[0;36mDataset._map_single\u001b[0;34m(shard, function, with_indices, with_rank, input_columns, batched, batch_size, drop_last_batch, remove_columns, keep_in_memory, cache_file_name, writer_batch_size, features, disable_nullable, fn_kwargs, new_fingerprint, rank, offset)\u001b[0m\n\u001b[1;32m 3439\u001b[0m _time \u001b[39m=\u001b[39m time\u001b[39m.\u001b[39mtime()\n\u001b[1;32m 3440\u001b[0m \u001b[39mfor\u001b[39;00m i, example \u001b[39min\u001b[39;00m shard_iterable:\n\u001b[0;32m-> 3441\u001b[0m example \u001b[39m=\u001b[39m apply_function_on_filtered_inputs(example, i, offset\u001b[39m=\u001b[39;49moffset)\n\u001b[1;32m 3442\u001b[0m \u001b[39mif\u001b[39;00m update_data:\n\u001b[1;32m 3443\u001b[0m \u001b[39mif\u001b[39;00m i \u001b[39m==\u001b[39m \u001b[39m0\u001b[39m:\n", - "File \u001b[0;32m~/Desktop/HF_bigcode/.venv/lib/python3.11/site-packages/datasets/arrow_dataset.py:3344\u001b[0m, in \u001b[0;36mDataset._map_single..apply_function_on_filtered_inputs\u001b[0;34m(pa_inputs, indices, check_same_num_examples, offset)\u001b[0m\n\u001b[1;32m 3342\u001b[0m \u001b[39mif\u001b[39;00m with_rank:\n\u001b[1;32m 3343\u001b[0m additional_args \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m (rank,)\n\u001b[0;32m-> 3344\u001b[0m processed_inputs \u001b[39m=\u001b[39m function(\u001b[39m*\u001b[39;49mfn_args, \u001b[39m*\u001b[39;49madditional_args, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mfn_kwargs)\n\u001b[1;32m 3345\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(processed_inputs, LazyDict):\n\u001b[1;32m 3346\u001b[0m processed_inputs \u001b[39m=\u001b[39m {\n\u001b[1;32m 3347\u001b[0m k: v \u001b[39mfor\u001b[39;00m k, v \u001b[39min\u001b[39;00m processed_inputs\u001b[39m.\u001b[39mdata\u001b[39m.\u001b[39mitems() \u001b[39mif\u001b[39;00m k \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m 
processed_inputs\u001b[39m.\u001b[39mkeys_to_format\n\u001b[1;32m 3348\u001b[0m }\n", - "Cell \u001b[0;32mIn[396], line 170\u001b[0m, in \u001b[0;36mmerge_events\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 168\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mIndexError\u001b[39;00m:\n\u001b[1;32m 169\u001b[0m base_data \u001b[39m=\u001b[39m issues_events[\u001b[39m0\u001b[39m]\n\u001b[0;32m--> 170\u001b[0m first_event \u001b[39m=\u001b[39m base_data[\u001b[39m\"\u001b[39;49m\u001b[39mevents\u001b[39;49m\u001b[39m\"\u001b[39;49m][\u001b[39m0\u001b[39m]\n\u001b[1;32m 171\u001b[0m base_data[\u001b[39m'\u001b[39m\u001b[39mpull_request.title\u001b[39m\u001b[39m'\u001b[39m] \u001b[39m=\u001b[39m first_event[\u001b[39m\"\u001b[39m\u001b[39mtitle\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[1;32m 172\u001b[0m base_data[\u001b[39m\"\u001b[39m\u001b[39mrepo.name\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m base_data[\u001b[39m\"\u001b[39m\u001b[39mrepo\u001b[39m\u001b[39m\"\u001b[39m]\n", - "\u001b[0;31mKeyError\u001b[0m: 'events'" - ] - } - ], - "source": [ - "merged_ds = small_ds_2.map(merge_events, remove_columns=[\"pull_request.events\", \"pull_request.code_review_events\", \"pull_request.issue_events\", '__index_level_0__','pull_request.guid'])" - ] - }, - { - "cell_type": "code", - "execution_count": 150, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Creating parquet from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 31.42ba/s]\n", - "Upload 1 LFS files: 100%|██████████| 1/1 [00:10<00:00, 10.30s/it]\n", - "Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:11<00:00, 11.45s/it]\n" - ] - } - ], - "source": [ - "merged_ds.push_to_hub(\"loubnabnl/code_reviews_3\")" - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading readme: 100%|██████████| 5.88k/5.88k [00:00<00:00, 3.76MB/s]\n" 
- ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Downloading and preparing dataset None/None to /Users/loubnabenallal/.cache/huggingface/datasets/loubnabnl___parquet/loubnabnl--clean_prs2-50c7cc07186d2bb2/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading data: 100%|██████████| 16.1M/16.1M [00:00<00:00, 17.4MB/s]\n", - "Downloading data files: 100%|██████████| 1/1 [00:02<00:00, 2.65s/it]\n", - "Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 676.50it/s]\n", - " \r" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dataset parquet downloaded and prepared to /Users/loubnabenallal/.cache/huggingface/datasets/loubnabnl___parquet/loubnabnl--clean_prs2-50c7cc07186d2bb2/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.\n" - ] - }, - { - "data": { - "text/plain": [ - "Dataset({\n", - " features: ['bucket', 'pull_request_info', 'head_repo_info', 'base_repo_info', 'events'],\n", - " num_rows: 10000\n", - "})" - ] - }, - "execution_count": 72, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ds = load_dataset(\"loubnabnl/clean_prs2\", split=\"train\")\n", - "ds" - ] - }, - { - "cell_type": "code", - "execution_count": 123, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'__index_level_0__': 1028,\n", - " 'bucket': None,\n", - " 'pull_request.code_review_events': None,\n", - " 'pull_request.events': '[{\"type\": \"PullRequestEvent\", \"action\": \"opened\", '\n", - " '\"actor.login\": \"M-Davies\", \"actor.id\": 25231953, '\n", - " '\"user.login\": null, \"user.id\": null, \"user.type\": '\n", - " 'null, \"repo.name\": \"AdoptOpenJDK/openjdk-build\", '\n", - " '\"repo.id\": 85294562, \"public\": true, \"created_at\": '\n", - " '\"2020-05-28T09:45:30Z\", \"org.id\": 
1673867, '\n", - " '\"org.login\": \"AdoptOpenJDK\", \"pull_request.id\": '\n", - " '424372800, \"pull_request.number\": 1787, '\n", - " '\"pull_request.state\": \"open\", \"pull_request.title\": '\n", - " '\"Revert \\'Fire installer failure on all failed '\n", - " 'results\\'\", \"pull_request.body\": \"* Seems to cause a '\n", - " 'lot of false positives or just doesnt work overall. '\n", - " 'Better to just remove for '\n", - " 'now\\\\r\\\\n\\\\r\\\\nSigned-off-by: Morgan Davies '\n", - " '\", \"pull_request.user.login\": '\n", - " '\"M-Davies\", \"pull_request.user.id\": 25231953, '\n", - " '\"pull_request.author_association\": \"CONTRIBUTOR\", '\n", - " '\"pull_request.created_at\": \"2020-05-28T09:45:30Z\", '\n", - " '\"pull_request.updated_at\": \"2020-05-28T09:45:30Z\", '\n", - " '\"pull_request.closed_at\": null, '\n", - " '\"pull_request.merged_at\": null, '\n", - " '\"pull_request.merge_commit_sha\": null, '\n", - " '\"pull_request.locked\": false, '\n", - " '\"pull_request.assignee.login\": null, '\n", - " '\"pull_request.assignee.id\": null, '\n", - " '\"pull_request.assignee.type\": null, '\n", - " '\"pull_request.assignee.site_admin\": null, '\n", - " '\"pull_request.milestone.id\": null, '\n", - " '\"pull_request.milestone.number\": null, '\n", - " '\"pull_request.milestone.title\": null, '\n", - " '\"pull_request.milestone.description\": null, '\n", - " '\"pull_request.milestone.creator.login\": null, '\n", - " '\"pull_request.milestone.creator.id\": null, '\n", - " '\"pull_request.milestone.creator.type\": null, '\n", - " '\"pull_request.milestone.creator.site_admin\": null, '\n", - " '\"pull_request.milestone.open_issues\": null, '\n", - " '\"pull_request.milestone.closed_issues\": null, '\n", - " '\"pull_request.milestone.state\": null, '\n", - " '\"pull_request.milestone.created_at\": null, '\n", - " '\"pull_request.milestone.updated_at\": null, '\n", - " '\"pull_request.milestone.due_on\": null, '\n", - " 
'\"pull_request.milestone.closed_at\": null, '\n", - " '\"pull_request.merged\": false, '\n", - " '\"pull_request.mergeable\": null, '\n", - " '\"pull_request.mergeable_state\": \"unknown\", '\n", - " '\"pull_request.merged_by.login\": null, '\n", - " '\"pull_request.merged_by.id\": null, '\n", - " '\"pull_request.merged_by.type\": null, '\n", - " '\"pull_request.merged_by.site_admin\": null, '\n", - " '\"pull_request.comments\": 0, '\n", - " '\"pull_request.review_comments\": 0, '\n", - " '\"pull_request.commits\": 1, \"pull_request.additions\": '\n", - " '4, \"pull_request.deletions\": 6, '\n", - " '\"pull_request.changed_files\": 1, '\n", - " '\"pull_request.label.id\": null, '\n", - " '\"pull_request.label.name\": null, '\n", - " '\"pull_request.label.color\": null, '\n", - " '\"pull_request.label.default\": null, '\n", - " '\"pull_request.head.label\": \"M-Davies:revert\", '\n", - " '\"pull_request.head.ref\": \"revert\", '\n", - " '\"pull_request.head.sha\": '\n", - " '\"023faba7db4130d746f68e6b4fb26170a3834254\", '\n", - " '\"pull_request.head.user.login\": \"M-Davies\", '\n", - " '\"pull_request.head.user.type\": \"User\", '\n", - " '\"pull_request.head.repo.name\": \"openjdk-build\", '\n", - " '\"pull_request.head.repo.full_name\": '\n", - " '\"M-Davies/openjdk-build\", '\n", - " '\"pull_request.head.repo.owner.login\": \"M-Davies\", '\n", - " '\"pull_request.head.repo.owner.type\": \"User\", '\n", - " '\"pull_request.head.repo.private\": false, '\n", - " '\"pull_request.head.repo.homepage\": \"\", '\n", - " '\"pull_request.head.repo.description\": \"AdoptOpenJDK '\n", - " 'community OpenJDK build scripts - common across all '\n", - " 'releases/versions\", \"pull_request.head.repo.fork\": '\n", - " 'true, \"pull_request.head.repo.created_at\": '\n", - " '\"2019-11-29T09:24:43Z\", '\n", - " '\"pull_request.head.repo.updated_at\": '\n", - " '\"2020-05-27T14:45:16Z\", '\n", - " '\"pull_request.head.repo.pushed_at\": '\n", - " '\"2020-05-27T14:45:13Z\", '\n", 
- " '\"pull_request.head.repo.size\": 2383, '\n", - " '\"pull_request.head.repo.stargazers_count\": 0, '\n", - " '\"pull_request.head.repo.watchers_count\": 0, '\n", - " '\"pull_request.head.repo.language\": \"Shell\", '\n", - " '\"pull_request.head.repo.has_issues\": false, '\n", - " '\"pull_request.head.repo.has_projects\": true, '\n", - " '\"pull_request.head.repo.has_downloads\": true, '\n", - " '\"pull_request.head.repo.has_wiki\": true, '\n", - " '\"pull_request.head.repo.has_pages\": false, '\n", - " '\"pull_request.head.repo.forks_count\": 0, '\n", - " '\"pull_request.head.repo.archived\": false, '\n", - " '\"pull_request.head.repo.disabled\": false, '\n", - " '\"pull_request.head.repo.open_issues_count\": 0, '\n", - " '\"pull_request.head.repo.forks\": 0, '\n", - " '\"pull_request.head.repo.open_issues\": 0, '\n", - " '\"pull_request.head.repo.watchers\": 0, '\n", - " '\"pull_request.head.repo.default_branch\": \"master\", '\n", - " '\"pull_request.head.repo.license.key\": \"apache-2.0\", '\n", - " '\"pull_request.head.repo.license.spdx_id\": '\n", - " '\"Apache-2.0\", \"pull_request.head.repo.license.name\": '\n", - " '\"Apache License 2.0\", \"pull_request.base.label\": '\n", - " '\"AdoptOpenJDK:master\", \"pull_request.base.ref\": '\n", - " '\"master\", \"pull_request.base.sha\": '\n", - " '\"32a19e7a01b4d50cc8c10f8f675a2aeb2ffeaefb\", '\n", - " '\"pull_request.base.user.login\": \"AdoptOpenJDK\", '\n", - " '\"pull_request.base.user.type\": \"Organization\", '\n", - " '\"pull_request.base.repo.name\": \"openjdk-build\", '\n", - " '\"pull_request.base.repo.full_name\": '\n", - " '\"AdoptOpenJDK/openjdk-build\", '\n", - " '\"pull_request.base.repo.owner.login\": \"AdoptOpenJDK\", '\n", - " '\"pull_request.base.repo.owner.type\": \"Organization\", '\n", - " '\"pull_request.base.repo.private\": false, '\n", - " '\"pull_request.base.repo.homepage\": \"\", '\n", - " '\"pull_request.base.repo.description\": \"AdoptOpenJDK '\n", - " 'community OpenJDK build 
scripts - common across all '\n", - " 'releases/versions\", \"pull_request.base.repo.fork\": '\n", - " 'false, \"pull_request.base.repo.created_at\": '\n", - " '\"2017-03-17T09:31:50Z\", '\n", - " '\"pull_request.base.repo.updated_at\": '\n", - " '\"2020-05-28T07:45:12Z\", '\n", - " '\"pull_request.base.repo.pushed_at\": '\n", - " '\"2020-05-27T14:18:11Z\", '\n", - " '\"pull_request.base.repo.size\": 2234, '\n", - " '\"pull_request.base.repo.stargazers_count\": 620, '\n", - " '\"pull_request.base.repo.watchers_count\": 620, '\n", - " '\"pull_request.base.repo.language\": \"Shell\", '\n", - " '\"pull_request.base.repo.has_issues\": true, '\n", - " '\"pull_request.base.repo.has_projects\": true, '\n", - " '\"pull_request.base.repo.has_downloads\": true, '\n", - " '\"pull_request.base.repo.has_wiki\": true, '\n", - " '\"pull_request.base.repo.has_pages\": false, '\n", - " '\"pull_request.base.repo.forks_count\": 137, '\n", - " '\"pull_request.base.repo.archived\": false, '\n", - " '\"pull_request.base.repo.disabled\": false, '\n", - " '\"pull_request.base.repo.open_issues_count\": 166, '\n", - " '\"pull_request.base.repo.forks\": 137, '\n", - " '\"pull_request.base.repo.open_issues\": 166, '\n", - " '\"pull_request.base.repo.watchers\": 620, '\n", - " '\"pull_request.base.repo.default_branch\": \"master\", '\n", - " '\"pull_request.base.repo.license.key\": \"apache-2.0\", '\n", - " '\"pull_request.base.repo.license.spdx_id\": '\n", - " '\"Apache-2.0\", \"pull_request.base.repo.license.name\": '\n", - " '\"Apache License 2.0\", \"pull_request.guid\": '\n", - " '\"AdoptOpenJDK/openjdk-build/pull/1787\"}, {\"type\": '\n", - " '\"PullRequestEvent\", \"action\": \"closed\", '\n", - " '\"actor.login\": \"sxa\", \"actor.id\": 6487691, '\n", - " '\"user.login\": null, \"user.id\": null, \"user.type\": '\n", - " 'null, \"repo.name\": \"AdoptOpenJDK/openjdk-build\", '\n", - " '\"repo.id\": 85294562, \"public\": true, \"created_at\": '\n", - " '\"2020-05-28T09:51:49Z\", 
\"org.id\": 1673867, '\n", - " '\"org.login\": \"AdoptOpenJDK\", \"pull_request.id\": '\n", - " '424372800, \"pull_request.number\": 1787, '\n", - " '\"pull_request.state\": \"closed\", \"pull_request.title\": '\n", - " '\"Revert \\'Fire installer failure on all failed '\n", - " 'results\\'\", \"pull_request.body\": \"* Seems to cause a '\n", - " 'lot of false positives or just doesnt work overall. '\n", - " 'Better to just remove for '\n", - " 'now\\\\r\\\\n\\\\r\\\\nSigned-off-by: Morgan Davies '\n", - " '\", \"pull_request.user.login\": '\n", - " '\"M-Davies\", \"pull_request.user.id\": 25231953, '\n", - " '\"pull_request.author_association\": \"CONTRIBUTOR\", '\n", - " '\"pull_request.created_at\": \"2020-05-28T09:45:30Z\", '\n", - " '\"pull_request.updated_at\": \"2020-05-28T09:51:48Z\", '\n", - " '\"pull_request.closed_at\": \"2020-05-28T09:51:48Z\", '\n", - " '\"pull_request.merged_at\": \"2020-05-28T09:51:48Z\", '\n", - " '\"pull_request.merge_commit_sha\": '\n", - " '\"4c3495c6f008459ca1c276477c5f968e9dcd7c6b\", '\n", - " '\"pull_request.locked\": false, '\n", - " '\"pull_request.assignee.login\": null, '\n", - " '\"pull_request.assignee.id\": null, '\n", - " '\"pull_request.assignee.type\": null, '\n", - " '\"pull_request.assignee.site_admin\": null, '\n", - " '\"pull_request.milestone.id\": null, '\n", - " '\"pull_request.milestone.number\": null, '\n", - " '\"pull_request.milestone.title\": null, '\n", - " '\"pull_request.milestone.description\": null, '\n", - " '\"pull_request.milestone.creator.login\": null, '\n", - " '\"pull_request.milestone.creator.id\": null, '\n", - " '\"pull_request.milestone.creator.type\": null, '\n", - " '\"pull_request.milestone.creator.site_admin\": null, '\n", - " '\"pull_request.milestone.open_issues\": null, '\n", - " '\"pull_request.milestone.closed_issues\": null, '\n", - " '\"pull_request.milestone.state\": null, '\n", - " '\"pull_request.milestone.created_at\": null, '\n", - " '\"pull_request.milestone.updated_at\": 
null, '\n", - " '\"pull_request.milestone.due_on\": null, '\n", - " '\"pull_request.milestone.closed_at\": null, '\n", - " '\"pull_request.merged\": true, '\n", - " '\"pull_request.mergeable\": null, '\n", - " '\"pull_request.mergeable_state\": \"unknown\", '\n", - " '\"pull_request.merged_by.login\": \"sxa\", '\n", - " '\"pull_request.merged_by.id\": 6487691, '\n", - " '\"pull_request.merged_by.type\": \"User\", '\n", - " '\"pull_request.merged_by.site_admin\": false, '\n", - " '\"pull_request.comments\": 0, '\n", - " '\"pull_request.review_comments\": 0, '\n", - " '\"pull_request.commits\": 1, \"pull_request.additions\": '\n", - " '4, \"pull_request.deletions\": 6, '\n", - " '\"pull_request.changed_files\": 1, '\n", - " '\"pull_request.label.id\": null, '\n", - " '\"pull_request.label.name\": null, '\n", - " '\"pull_request.label.color\": null, '\n", - " '\"pull_request.label.default\": null, '\n", - " '\"pull_request.head.label\": \"M-Davies:revert\", '\n", - " '\"pull_request.head.ref\": \"revert\", '\n", - " '\"pull_request.head.sha\": '\n", - " '\"023faba7db4130d746f68e6b4fb26170a3834254\", '\n", - " '\"pull_request.head.user.login\": \"M-Davies\", '\n", - " '\"pull_request.head.user.type\": \"User\", '\n", - " '\"pull_request.head.repo.name\": \"openjdk-build\", '\n", - " '\"pull_request.head.repo.full_name\": '\n", - " '\"M-Davies/openjdk-build\", '\n", - " '\"pull_request.head.repo.owner.login\": \"M-Davies\", '\n", - " '\"pull_request.head.repo.owner.type\": \"User\", '\n", - " '\"pull_request.head.repo.private\": false, '\n", - " '\"pull_request.head.repo.homepage\": \"\", '\n", - " '\"pull_request.head.repo.description\": \"AdoptOpenJDK '\n", - " 'community OpenJDK build scripts - common across all '\n", - " 'releases/versions\", \"pull_request.head.repo.fork\": '\n", - " 'true, \"pull_request.head.repo.created_at\": '\n", - " '\"2019-11-29T09:24:43Z\", '\n", - " '\"pull_request.head.repo.updated_at\": '\n", - " '\"2020-05-27T14:45:16Z\", '\n", - " 
'\"pull_request.head.repo.pushed_at\": '\n", - " '\"2020-05-28T09:46:04Z\", '\n", - " '\"pull_request.head.repo.size\": 2383, '\n", - " '\"pull_request.head.repo.stargazers_count\": 0, '\n", - " '\"pull_request.head.repo.watchers_count\": 0, '\n", - " '\"pull_request.head.repo.language\": \"Shell\", '\n", - " '\"pull_request.head.repo.has_issues\": false, '\n", - " '\"pull_request.head.repo.has_projects\": true, '\n", - " '\"pull_request.head.repo.has_downloads\": true, '\n", - " '\"pull_request.head.repo.has_wiki\": true, '\n", - " '\"pull_request.head.repo.has_pages\": false, '\n", - " '\"pull_request.head.repo.forks_count\": 0, '\n", - " '\"pull_request.head.repo.archived\": false, '\n", - " '\"pull_request.head.repo.disabled\": false, '\n", - " '\"pull_request.head.repo.open_issues_count\": 0, '\n", - " '\"pull_request.head.repo.forks\": 0, '\n", - " '\"pull_request.head.repo.open_issues\": 0, '\n", - " '\"pull_request.head.repo.watchers\": 0, '\n", - " '\"pull_request.head.repo.default_branch\": \"master\", '\n", - " '\"pull_request.head.repo.license.key\": \"apache-2.0\", '\n", - " '\"pull_request.head.repo.license.spdx_id\": '\n", - " '\"Apache-2.0\", \"pull_request.head.repo.license.name\": '\n", - " '\"Apache License 2.0\", \"pull_request.base.label\": '\n", - " '\"AdoptOpenJDK:master\", \"pull_request.base.ref\": '\n", - " '\"master\", \"pull_request.base.sha\": '\n", - " '\"32a19e7a01b4d50cc8c10f8f675a2aeb2ffeaefb\", '\n", - " '\"pull_request.base.user.login\": \"AdoptOpenJDK\", '\n", - " '\"pull_request.base.user.type\": \"Organization\", '\n", - " '\"pull_request.base.repo.name\": \"openjdk-build\", '\n", - " '\"pull_request.base.repo.full_name\": '\n", - " '\"AdoptOpenJDK/openjdk-build\", '\n", - " '\"pull_request.base.repo.owner.login\": \"AdoptOpenJDK\", '\n", - " '\"pull_request.base.repo.owner.type\": \"Organization\", '\n", - " '\"pull_request.base.repo.private\": false, '\n", - " '\"pull_request.base.repo.homepage\": \"\", '\n", - " 
'\"pull_request.base.repo.description\": \"AdoptOpenJDK '\n", - " 'community OpenJDK build scripts - common across all '\n", - " 'releases/versions\", \"pull_request.base.repo.fork\": '\n", - " 'false, \"pull_request.base.repo.created_at\": '\n", - " '\"2017-03-17T09:31:50Z\", '\n", - " '\"pull_request.base.repo.updated_at\": '\n", - " '\"2020-05-28T07:45:12Z\", '\n", - " '\"pull_request.base.repo.pushed_at\": '\n", - " '\"2020-05-28T09:51:48Z\", '\n", - " '\"pull_request.base.repo.size\": 2234, '\n", - " '\"pull_request.base.repo.stargazers_count\": 620, '\n", - " '\"pull_request.base.repo.watchers_count\": 620, '\n", - " '\"pull_request.base.repo.language\": \"Shell\", '\n", - " '\"pull_request.base.repo.has_issues\": true, '\n", - " '\"pull_request.base.repo.has_projects\": true, '\n", - " '\"pull_request.base.repo.has_downloads\": true, '\n", - " '\"pull_request.base.repo.has_wiki\": true, '\n", - " '\"pull_request.base.repo.has_pages\": false, '\n", - " '\"pull_request.base.repo.forks_count\": 137, '\n", - " '\"pull_request.base.repo.archived\": false, '\n", - " '\"pull_request.base.repo.disabled\": false, '\n", - " '\"pull_request.base.repo.open_issues_count\": 165, '\n", - " '\"pull_request.base.repo.forks\": 137, '\n", - " '\"pull_request.base.repo.open_issues\": 165, '\n", - " '\"pull_request.base.repo.watchers\": 620, '\n", - " '\"pull_request.base.repo.default_branch\": \"master\", '\n", - " '\"pull_request.base.repo.license.key\": \"apache-2.0\", '\n", - " '\"pull_request.base.repo.license.spdx_id\": '\n", - " '\"Apache-2.0\", \"pull_request.base.repo.license.name\": '\n", - " '\"Apache License 2.0\", \"pull_request.guid\": '\n", - " '\"AdoptOpenJDK/openjdk-build/pull/1787\"}]',\n", - " 'pull_request.guid': 'AdoptOpenJDK/openjdk-build/pull/1787',\n", - " 'pull_request.issue_events': None}\n" - ] - } - ], - "source": [ - "from pprint import pprint\n", - "pprint(small_ds[50])" - ] - }, - { - "cell_type": "code", - "execution_count": 151, - "metadata": {}, 
- "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[{'action': 'opened',\n", - " 'actor.id': 25231953,\n", - " 'actor.login': 'M-Davies',\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': datetime.datetime(2020, 5, 28, 9, 45, 30, tzinfo=),\n", - " 'issue.author': None,\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': None,\n", - " 'pull_request.merged': False,\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.state': 'open',\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'PullRequestEvent',\n", - " 'user.login': None,\n", - " 'user.type': None},\n", - " {'action': 'closed',\n", - " 'actor.id': 6487691,\n", - " 'actor.login': 'sxa',\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 
'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': datetime.datetime(2020, 5, 28, 9, 51, 49, tzinfo=),\n", - " 'issue.author': None,\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': None,\n", - " 'pull_request.merged': True,\n", - " 'pull_request.merged_by.login': 'sxa',\n", - " 'pull_request.merged_by.type': 'User',\n", - " 'pull_request.state': 'closed',\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'PullRequestEvent',\n", - " 'user.login': None,\n", - " 'user.type': None}]\n" - ] - } - ], - "source": [ - "pprint(merged_ds[50][\"events\"])" + "merged_ds = ds.map(merge_events, remove_columns=[\"pull_request.events\", \"pull_request.code_review_events\", \"pull_request.issue_events\", '__index_level_0__','pull_request.guid'])" ] }, { "cell_type": "code", - "execution_count": 222, + "execution_count": 449, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Found cached dataset parquet (/Users/loubnabenallal/.cache/huggingface/datasets/loubnabnl___parquet/loubnabnl--code_reviews_3-c3e4ac735edf14b4/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)\n" + "Creating parquet from Arrow format: 100%|██████████| 84/84 [00:02<00:00, 37.20ba/s]\n", + "Upload 1 LFS files: 100%|██████████| 1/1 [00:22<00:00, 22.10s/it]\n", + "Creating parquet from Arrow format: 100%|██████████| 84/84 [00:02<00:00, 33.12ba/s]s/it]\n", + "Upload 1 LFS files: 100%|██████████| 1/1 [00:09<00:00, 9.55s/it]\n", + "Creating parquet from Arrow format: 100%|██████████| 84/84 [00:02<00:00, 39.47ba/s]s/it]\n", + "Upload 1 LFS files: 100%|██████████| 1/1 [00:09<00:00, 9.99s/it]\n", + "Creating parquet from Arrow 
format: 100%|██████████| 84/84 [00:02<00:00, 37.45ba/s]s/it]\n", + "Upload 1 LFS files: 100%|██████████| 1/1 [00:23<00:00, 23.74s/it]\n", + "Creating parquet from Arrow format: 100%|██████████| 84/84 [00:02<00:00, 34.84ba/s]s/it]\n", + "Upload 1 LFS files: 100%|██████████| 1/1 [00:22<00:00, 22.48s/it]\n", + "Creating parquet from Arrow format: 100%|██████████| 84/84 [00:03<00:00, 26.04ba/s]s/it]\n", + "Upload 1 LFS files: 100%|██████████| 1/1 [00:22<00:00, 22.62s/it]\n", + "Pushing dataset shards to the dataset hub: 100%|██████████| 6/6 [02:10<00:00, 21.69s/it]\n" ] } ], "source": [ - "ds = load_dataset(\"loubnabnl/code_reviews_3\", split=\"train\")\n", - "size = len(ds)" - ] - }, - { - "cell_type": "code", - "execution_count": 223, - "metadata": {}, - "outputs": [], - "source": [ - "sample = ds[1470]\n", - "events = sample[\"events\"]\n", - "grouped_events = create_grouped_events(events)\n", - "original_poster = sample[\"pull_request_info\"]['pull_request.user.login']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pprint import pprint\n", - "\n", - "pprint(small_ds[50])" - ] - }, - { - "cell_type": "code", - "execution_count": 224, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "📝 **Title**: Fix @inheritDocs behavior
\n", - " 📦 **GitHub Repo**: Azure/azure-sdk-for-java, PR Number: 26816, ID: 836647691.
\n", - " Link: [https://github.com/Azure/azure-sdk-for-java/pull/26816](https://github.com/Azure/azure-sdk-for-java/pull/26816)" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
AttributeDetail
🧾 PR Typeissue
🟢 PR Stateopen
👤 PR Authorkasobol-msft
🏷️ Head Branchref: kasobol-msft-patch-1, label: Azure:kasobol-msft-patch-1
🌳 Base Branchmain
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Make sure that dependency sources are included in javadoc generation.\r\n", - "\r\n", - "Fixes https://github.com/Azure/azure-sdk-for-java/issues/26814" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "def get_pr_info(sample):\n", - " pr_info = sample[\"pull_request_info\"]\n", - " head_info = sample[\"head_repo_info\"]\n", - " base_info = sample[\"base_repo_info\"]\n", - " events = sample[\"events\"]\n", - "\n", - " gh_link = f\"https://github.com/{pr_info['repo.name']}/pull/{pr_info['pull_request.number']}\"\n", - "\n", - " header = f\"\"\"📝 **Title**: {pr_info['pull_request.title']}
\n", - " 📦 **GitHub Repo**: {pr_info['repo.name']}, PR Number: {pr_info['pull_request.number']}, ID: {pr_info['pull_request.id']}.
\n", - " Link: [{gh_link}]({gh_link})\"\"\"\n", - " pr_info_html = f\"\"\"\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
AttributeDetail
🧾 PR Type{events[0]['type']}
🟢 PR State{pr_info['pull_request.state']}
👤 PR Author{pr_info['pull_request.user.login']}
🏷️ Head Branchref: {head_info['pull_request.head.ref']}, label: {head_info['pull_request.head.label']}
🌳 Base Branch{base_info['pull_request.base.ref']}
\n", - " \"\"\"\n", - " return header, pr_info_html, pr_info['pull_request.body']\n", - "\n", - "from IPython.display import HTML, display\n", - "display(HTML(get_pr_info(sample)[0]))\n", - "display(HTML(get_pr_info(sample)[1]))\n", - "display(HTML(get_pr_info(sample)[2]))" - ] - }, - { - "cell_type": "code", - "execution_count": 308, - "metadata": {}, - "outputs": [], - "source": [ - "sample = ds[4]\n", - "events = sample[\"events\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 309, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2" - ] - }, - "execution_count": 309, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(events)" - ] - }, - { - "cell_type": "code", - "execution_count": 310, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[{'action': 'opened',\n", - " 'actor.id': 39814207,\n", - " 'actor.login': 'pull[bot]',\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': datetime.datetime(2022, 10, 10, 10, 57, 41, tzinfo=),\n", - " 'issue.author': None,\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': None,\n", - " 'pull_request.merged': False,\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.state': 'open',\n", - " 'review.author_association': None,\n", - " 
'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'PullRequestEvent',\n", - " 'user.login': None,\n", - " 'user.type': None},\n", - " {'action': 'closed',\n", - " 'actor.id': 39814207,\n", - " 'actor.login': 'pull[bot]',\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': datetime.datetime(2022, 10, 10, 11, 1, 28, tzinfo=),\n", - " 'issue.author': None,\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': None,\n", - " 'pull_request.merged': True,\n", - " 'pull_request.merged_by.login': 'pull[bot]',\n", - " 'pull_request.merged_by.type': 'Bot',\n", - " 'pull_request.state': 'closed',\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'PullRequestEvent',\n", - " 'user.login': None,\n", - " 'user.type': None}]\n" - ] - } - ], - "source": [ - "pprint(events)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "import uuid\n", - "\n", - "def create_grouped_events(events):\n", - " df = pd.DataFrame(events)\n", - " # Ensure it's in datetime format\n", - " df['created_at'] = pd.to_datetime(df['created_at'])\n", - 
" # Create a new column 'uuid' initialized with None\n", - " df['uuid'] = None\n", - " # For rows where either 'comment.diff_hunk' or 'comment.commit_id' is NaN, assign a unique UUID\n", - " mask = df['comment.diff_hunk'].isna() | df['comment.commit_id'].isna()\n", - " df.loc[mask, 'uuid'] = [str(uuid.uuid4()) for _ in range(mask.sum())]\n", - " # Group by 'comment.diff_hunk', 'comment.commit_id', and 'uuid'\n", - " grouped_events = [group.drop(columns='uuid').to_dict(orient='records') for _, group in df.groupby(['comment.diff_hunk', 'comment.commit_id', 'uuid'], dropna=False)]\n", - " return grouped_events\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 229, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "846\n" - ] - } - ], - "source": [ - "for i in range(len(ds)):\n", - " e = ds[i]\n", - " if e[\"events\"][0][\"comment.diff_hunk\"]:\n", - " print(i)\n", - " break" - ] - }, - { - "cell_type": "code", - "execution_count": 299, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[{'action': 'opened',\n", - " 'actor.id': 1753262,\n", - " 'actor.login': 'mo9a7i',\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': datetime.datetime(2022, 5, 5, 4, 35, 2, tzinfo=),\n", - " 'issue.author': None,\n", - " 'issue.comment': None,\n", - " 
'issue.comment_id': None,\n", - " 'pull_request.merged': False,\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.state': 'open',\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'PullRequestEvent',\n", - " 'user.login': None,\n", - " 'user.type': None},\n", - " {'action': 'created',\n", - " 'actor.id': 1753262,\n", - " 'actor.login': 'mo9a7i',\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': datetime.datetime(2022, 5, 5, 4, 35, 2, tzinfo=),\n", - " 'issue.author': None,\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': None,\n", - " 'pull_request.merged': None,\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.state': 'open',\n", - " 'review.author_association': 'MEMBER',\n", - " 'review.body': 'looks fine',\n", - " 'review.commit_id': 'ba75444d1ada77cf5f3f06cd74b6320bab8db54b',\n", - " 'review.id': 962846794,\n", - " 'review.state': 'commented',\n", - " 'review.submitted_at': '2022-05-05T04:35:02Z',\n", - " 'type': 'PullRequestReviewEvent',\n", - " 'user.login': 'mo9a7i',\n", - " 'user.type': 'User'},\n", - " {'action': 'closed',\n", - " 'actor.id': 
1753262,\n", - " 'actor.login': 'mo9a7i',\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': datetime.datetime(2022, 5, 5, 4, 35, 3, tzinfo=),\n", - " 'issue.author': None,\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': None,\n", - " 'pull_request.merged': True,\n", - " 'pull_request.merged_by.login': 'mo9a7i',\n", - " 'pull_request.merged_by.type': 'User',\n", - " 'pull_request.state': 'closed',\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'PullRequestEvent',\n", - " 'user.login': None,\n", - " 'user.type': None}]\n" - ] - } - ], - "source": [ - "pprint(events)" - ] - }, - { - "cell_type": "code", - "execution_count": 303, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
actionactor.idactor.logincomment.author_associationcomment.bodycomment.commit_idcomment.created_atcomment.diff_hunkcomment.idcomment.in_reply_to_id...review.author_associationreview.bodyreview.commit_idreview.idreview.statereview.submitted_attypeuser.loginuser.typegroup_key
0opened1753262mo9a7iNoneNoneNoneNoneNoneNoneNone...NoneNoneNoneNaNNoneNonePullRequestEventNoneNone1.0
1created1753262mo9a7iNoneNoneNoneNoneNoneNoneNone...MEMBERlooks fineba75444d1ada77cf5f3f06cd74b6320bab8db54b962846794.0commented2022-05-05T04:35:02ZPullRequestReviewEventmo9a7iUserba75444d1ada77cf5f3f06cd74b6320bab8db54b
2closed1753262mo9a7iNoneNoneNoneNoneNoneNoneNone...NoneNoneNoneNaNNoneNonePullRequestEventNoneNone2.0
\n", - "

3 rows × 39 columns

\n", - "
" - ], - "text/plain": [ - " action actor.id actor.login comment.author_association comment.body \n", - "0 opened 1753262 mo9a7i None None \\\n", - "1 created 1753262 mo9a7i None None \n", - "2 closed 1753262 mo9a7i None None \n", - "\n", - " comment.commit_id comment.created_at comment.diff_hunk comment.id \n", - "0 None None None None \\\n", - "1 None None None None \n", - "2 None None None None \n", - "\n", - " comment.in_reply_to_id ... review.author_association review.body \n", - "0 None ... None None \\\n", - "1 None ... MEMBER looks fine \n", - "2 None ... None None \n", - "\n", - " review.commit_id review.id review.state \n", - "0 None NaN None \\\n", - "1 ba75444d1ada77cf5f3f06cd74b6320bab8db54b 962846794.0 commented \n", - "2 None NaN None \n", - "\n", - " review.submitted_at type user.login user.type \n", - "0 None PullRequestEvent None None \\\n", - "1 2022-05-05T04:35:02Z PullRequestReviewEvent mo9a7i User \n", - "2 None PullRequestEvent None None \n", - "\n", - " group_key \n", - "0 1.0 \n", - "1 ba75444d1ada77cf5f3f06cd74b6320bab8db54b \n", - "2 2.0 \n", - "\n", - "[3 rows x 39 columns]" - ] - }, - "execution_count": 303, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import numpy as np\n", - "df = pd.DataFrame(events)\n", - "df['created_at'] = pd.to_datetime(df['created_at'])\n", - "df.drop_duplicates(inplace=True)\n", - "# Create a new 'group_key' column. For non-null 'review.commit_id' values, it's the same value.\n", - "mask = df['review.commit_id'].isnull()\n", - "df.loc[mask, 'group_key'] = np.arange(mask.sum()) + 1\n", - "df.loc[~mask, 'group_key'] = df['review.commit_id']\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 304, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "df = pd.DataFrame(events)\n", - "df['created_at'] = pd.to_datetime(df['created_at'])\n", - "df.drop_duplicates(inplace=True)\n", - "# Create a new 'group_key' column. 
For non-null 'review.commit_id' values, it's the same value.\n", - "mask = df['review.commit_id'].isnull()\n", - "df.loc[mask, 'group_key'] = np.arange(mask.sum()) + 1\n", - "df.loc[~mask, 'group_key'] = df['review.commit_id']\n", - "\n", - "if len(df) == 1:\n", - " grouped_events = [[df.iloc[0].to_dict()]]\n", - "else:\n", - " grouped_events = [group.to_dict(orient='records') for _, group in df.groupby('group_key', dropna=False)]\n", - "\n", - "# sort by first event date\n", - "grouped_events = sorted(grouped_events, key=lambda x: x[0]['created_at'])\n" - ] - }, - { - "cell_type": "code", - "execution_count": 311, - "metadata": {}, - "outputs": [], - "source": [ - "def create_grouped_events(events):\n", - " \"\"\"group events that happened in the same review thread using review.commit_id\"\"\"\n", - " df = pd.DataFrame(events)\n", - " df['created_at'] = pd.to_datetime(df['created_at'])\n", - " df.drop_duplicates(inplace=True)\n", - " # Create a new 'group_key' where rows with NaN 'review.commit_id' get an identical identifier. 
Otherwise NaN values go in the same group\n", - " mask = df['review.commit_id'].isnull()\n", - " df.loc[mask, 'group_key'] = np.arange(mask.sum()) + 1\n", - " df.loc[~mask, 'group_key'] = df['review.commit_id']\n", - " \n", - " if len(df) == 1:\n", - " grouped_events = [[df.iloc[0].to_dict()]]\n", - " else:\n", - " grouped_events = [group.to_dict(orient='records') for _, group in df.groupby('group_key', dropna=False)]\n", - " \n", - " # sort by first event date\n", - " grouped_events = sorted(grouped_events, key=lambda x: x[0]['created_at'])\n", - " return grouped_events\n", - "\n", - "grouped_events = create_grouped_events(events)" - ] - }, - { - "cell_type": "code", - "execution_count": 312, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "len events 2 and len grouped_events 2\n" - ] - } - ], - "source": [ - "print(f\"len events {len(events)} and len grouped_events {len(grouped_events)}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 313, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "thread number 0\n", - "thread number 1\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \"\"\n", - " \n", - " \n", - "
Event TypePullRequestEvent
Userpull[bot]
Actionopened
Review StateNone
PR Stateopen, merged: False
Date2022-10-10 10:57:41+00:00
\n", - "
\n", - " \n", - "---------------------------------------------------------------------------------------------------------------------------------------------------------------------
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \"\"\n", - " \n", - " \n", - "
Event TypePullRequestEvent
Userpull[bot]
Actionclosed
Review StateNone
PR Stateclosed, merged: True
Date2022-10-10 11:01:28+00:00
\n", - "
\n", - " \n", - "---------------------------------------------------------------------------------------------------------------------------------------------------------------------
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "\n", - "original_poster = sample[\"pull_request_info\"]['pull_request.user.login']\n", - "thread_html = \"\"\n", - "c = 0\n", - "for thread in grouped_events:\n", - " print(f\"thread number {c}\")\n", - " c += 1\n", - " thread_html += '
'\n", - " # Get the first event in the thread as a reference\n", - " first_event = thread[0]\n", - " poster_name = first_event['actor.login'] or first_event['issue.author'] or first_event['user.login']\n", - " # Add shared parts of the events only once\n", - " user_type = f\"(type :{first_event['user.type']})\" if first_event['user.type'] else \"\"\n", - " review_state = f\"Review State{first_event['review.state']}\" if first_event['review.state'] else \"\"\n", - " text = f\"\"\"\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " {review_state}\n", - " \n", - " \n", - "
Event Type{first_event['type']}
User{poster_name} {user_type}
Action{first_event['action']}
PR State{first_event[\"pull_request.state\"]}, merged: {first_event['pull_request.merged']}
Date{first_event['created_at']}
\n", - "
\n", - " \"\"\"\n", - " highlight_action = \"background-color: #FFCFCF;\" if first_event['action'] == 'closed' else \"\"\n", - " highlight_pr_state = \"background-color: #FFCFCF;\" if first_event['pull_request.merged'] else \"\"\n", - "\n", - " text = f\"\"\"\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \"\"\n", - " \n", - " \n", - "
Event Type{first_event['type']}
User{poster_name} {user_type}
Action{first_event['action']}
Review State{first_event['review.state']}
PR State{first_event[\"pull_request.state\"]}, merged: {first_event['pull_request.merged']}
Date{first_event['created_at']}
\n", - "
\n", - " \"\"\"\n", - "\n", - "\n", - " thread_html += text\n", - " thread_html += (\"\\n\" + \"-\"*165)\n", - " # Add the bodies of the comments for each event in the thread\n", - " for event in thread:\n", - " # from 'actor.login' and 'issue.author' and 'user.login' take which ever isn't none\n", - " poster_name = event['actor.login'] or event['issue.author'] or event['user.login']\n", - " if event['comment.body'] or event[\"issue.comment\"]:\n", - " is_op = original_poster == poster_name\n", - " thread_html += format_body(event['comment.body'], poster_name, is_op)\n", - "\n", - " thread_html += '
'\n", - "\n", - "display(HTML(thread_html))" - ] - }, - { - "cell_type": "code", - "execution_count": 314, - "metadata": {}, - "outputs": [], - "source": [ - "def display_events(sample):\n", - " events = sample[\"events\"]\n", - " grouped_events = create_grouped_events(events)\n", - " original_poster = sample[\"pull_request_info\"]['pull_request.user.login']\n", - " for thread in grouped_events:\n", - " thread_html = '
'\n", - " # Get the first event in the thread as a reference\n", - " first_event = thread[0]\n", - " poster_name = first_event['actor.login'] or first_event['issue.author'] or first_event['user.login']\n", - " # Add shared parts of the events only once\n", - " user_type = f\"(type :{first_event['user.type']})\" if first_event['user.type'] else \"\"\n", - " highlight_action = \"background-color: #FFCFCF;\" if first_event['action'] == 'closed' else \"\"\n", - " highlight_pr_state = \"background-color: #FFCFCF;\" if first_event['pull_request.merged'] else \"\"\n", - " \n", - " text = f\"\"\"\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Event Type{first_event['type']}
User{poster_name} {user_type}
Action{first_event['action']}
Review State{first_event['review.state']}
PR State{first_event[\"pull_request.state\"]}, merged: {first_event['pull_request.merged']}
Date{first_event['created_at']}
\n", - "
\n", - " \"\"\"\n", - " print(f\"added first event of teh group\")\n", - " thread_html += text\n", - " \n", - " # Add the bodies of the comments for each event in the thread\n", - " for event in thread:\n", - " # from 'actor.login' and 'issue.author' and 'user.login' take which ever isn't none\n", - " poster_name = event['actor.login'] or event['issue.author'] or event['user.login']\n", - " if event['comment.body'] or event[\"issue.comment\"]:\n", - " is_op = original_poster == poster_name\n", - " thread_html += format_body(event['comment.body'], poster_name, is_op)\n", - "\n", - " thread_html += '
'\n", - " display(HTML(thread_html))\n", - " if first_event['comment.path']:\n", - " path_html = f\"Path: {first_event['comment.path']}\"\n", - " display(HTML(path_html))\n", - " display(HTML(\"---\"))" - ] - }, - { - "cell_type": "code", - "execution_count": 316, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[[{'action': 'opened',\n", - " 'actor.id': 39814207,\n", - " 'actor.login': 'pull[bot]',\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': Timestamp('2022-10-10 10:57:41+0000', tz='UTC'),\n", - " 'issue.author': None,\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': None,\n", - " 'pull_request.merged': False,\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.state': 'open',\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'PullRequestEvent',\n", - " 'user.login': None,\n", - " 'user.type': None,\n", - " 'group_key': 1.0}],\n", - " [{'action': 'closed',\n", - " 'actor.id': 39814207,\n", - " 'actor.login': 'pull[bot]',\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': 
None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': Timestamp('2022-10-10 11:01:28+0000', tz='UTC'),\n", - " 'issue.author': None,\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': None,\n", - " 'pull_request.merged': True,\n", - " 'pull_request.merged_by.login': 'pull[bot]',\n", - " 'pull_request.merged_by.type': 'Bot',\n", - " 'pull_request.state': 'closed',\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'PullRequestEvent',\n", - " 'user.login': None,\n", - " 'user.type': None,\n", - " 'group_key': 2.0}]]" - ] - }, - "execution_count": 316, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "grouped_events" - ] - }, - { - "cell_type": "code", - "execution_count": 315, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Event TypePullRequestEvent
Userpull[bot]
Actionopened
Review StateNone
PR Stateopen, merged: False
Date2022-10-10 10:57:41+00:00
\n", - "
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "---" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Event TypePullRequestEvent
Userpull[bot]
Actionopened
Review StateNone
PR Stateopen, merged: False
Date2022-10-10 10:57:41+00:00
\n", - "
\n", - "
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Event TypePullRequestEvent
Userpull[bot]
Actionclosed
Review StateNone
PR Stateclosed, merged: True
Date2022-10-10 11:01:28+00:00
\n", - "
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "---" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "display_events(sample)" - ] - }, - { - "cell_type": "code", - "execution_count": 261, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "single\n", - "2022-05-05 04:35:02+00:00\n", - "with review state: commented\n", - "None\n", - "User: mo9a7i, action: created\n", - "PullRequestReviewEvent\n", - "------------\n", - "multiple\n", - "single\n", - "2022-05-05 04:35:02+00:00\n", - "with review state: None\n", - "None\n", - "User: mo9a7i, action: opened\n", - "PullRequestEvent\n", - "------------\n", - "------------\n", - "2022-05-05 04:35:02+00:00\n", - "with review state: None PR state False\n", - "None\n", - "User: mo9a7i, action: closed\n", - "PullRequestEvent\n", - "------------\n", - "------------end multiple\n" - ] - } - ], - "source": [ - "for group in grouped_events:\n", - " if len(group) == 1:\n", - " poster_name = group[0]['actor.login'] or group[0]['issue.author'] or group[0]['user.login']\n", - " print(\"single\")\n", - " print(group[0][\"created_at\"])\n", - " print(f\"with review state: {group[0]['review.state']}\")\n", - " print(group[0][\"comment.body\"])\n", - " # print action type and user\n", - " print(f\"User: {poster_name}, action: {group[0]['action']}\")\n", - " print(group[0][\"type\"])\n", - " print(\"------------\")\n", - " continue\n", - " # date \n", - " else:\n", - " print(\"multiple\")\n", - " poster_name = group[0]['actor.login'] or group[0]['issue.author'] or group[0]['user.login']\n", - " print(\"single\")\n", - " print(group[0][\"created_at\"])\n", - " print(f\"with review state: {group[0]['review.state']}\")\n", - " print(group[0][\"comment.body\"])\n", - " # print action type and user\n", - " print(f\"User: {poster_name}, action: 
{group[0]['action']}\")\n", - " print(group[0][\"type\"])\n", - " print(\"------------\")\n", - " print(\"------------\")\n", - " for e in group[1:]:\n", - " print(group[0][\"created_at\"])\n", - " print(f\"with review state: {group[0]['review.state']} PR state {group[0]['pull_request.merged']}\")\n", - " print(e[\"comment.body\"])\n", - " poster_name = e['actor.login'] or e['issue.author'] or e['user.login']\n", - " print(f\"User: {poster_name}, action: {e['action']}\")\n", - " print(e[\"type\"])\n", - " print(\"------------\")\n", - " print(\"------------end multiple\")" - ] - }, - { - "cell_type": "code", - "execution_count": 225, - "metadata": {}, - "outputs": [], - "source": [ - "def create_grouped_events(events):\n", - " df = pd.DataFrame(events)\n", - " df['created_at'] = pd.to_datetime(df['created_at'])\n", - " df = df.sort_values(['comment.diff_hunk', 'comment.commit_id', 'created_at'])\n", - " # Group events in a the same thread using 'comment.diff_hunk' and 'comment.commit_id'\n", - " if len(df) == 1:\n", - " grouped_events = [[df.iloc[0].to_dict()]]\n", - " else:\n", - " grouped_events = [group.to_dict(orient='records') for _, group in df.groupby(['comment.diff_hunk', 'comment.commit_id', 'pull_request.state'], dropna=False)]\n", - " return grouped_events\n", - "\n", - "def format_body(text, user, is_op=False):\n", - " color = \"#007bff\" if is_op else \"black\"\n", - " pr_body = f\"
👤{user}: {text}
\"\n", - " return pr_body" - ] - }, - { - "cell_type": "code", - "execution_count": 220, - "metadata": {}, - "outputs": [], - "source": [ - "import uuid\n", - "import pandas as pd\n", - "\n", - "def create_grouped_events(events):\n", - " df = pd.DataFrame(events)\n", - " \n", - " # Ensure it's in datetime format\n", - " df['created_at'] = pd.to_datetime(df['created_at'])\n", - " # Preserve the original order\n", - " df['order'] = range(len(df))\n", - "\n", - " # Create a new column 'uuid' initialized with None\n", - " df['uuid'] = None\n", - "\n", - " # For rows where either 'comment.diff_hunk' or 'comment.commit_id' is NaN, assign a unique UUID\n", - " mask = df['comment.diff_hunk'].isna() | df['comment.commit_id'].isna()\n", - " df.loc[mask, 'uuid'] = [str(uuid.uuid4()) for _ in range(mask.sum())]\n", - "\n", - " # Group by 'comment.diff_hunk', 'comment.commit_id', and 'uuid'\n", - " grouped_events = [group.drop(columns=['uuid', 'order']).to_dict(orient='records') \n", - " for _, group in df.sort_values(by='order').groupby(['comment.diff_hunk', 'comment.commit_id', 'uuid'], dropna=False)]\n", - " # soert on created_at\n", - " grouped_events = [sorted(group, key=lambda x: x['created_at']) for group in grouped_events]\n", - " return grouped_events\n", - "\n", - "\n", - "\n", - "grouped_events = create_grouped_events(events)\n", - "c = 0\n", - "thread_html = \"\"\n", - "for thread in grouped_events:\n", - " # Start a new thread\n", - " #print(thread)\n", - " if thread[0][\"action\"] == \"opened\":\n", - " continue\n", - " thread_html += '
'\n", - " # Get the first event in the thread as a reference\n", - " first_event = thread[0]\n", - " poster_name = first_event['actor.login'] or first_event['issue.author'] or first_event['user.login']\n", - " # Add shared parts of the events only once\n", - " text = f\"\"\"\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Event Type{first_event['type']}
User{poster_name} (type :{first_event['user.type']})
Action{first_event['action']}
Review State{first_event['review.state']}
PR State{first_event[\"pull_request.state\"]}, merged: {first_event['pull_request.merged']}
From Head{sample[\"head_repo_info\"]['pull_request.head.label']}
\n", - "
\n", - " \"\"\"\n", - " thread_html += text\n", - " # add horizontal line\n", - " thread_html += '
'\n", - " for event in thread:\n", - " # from 'actor.login' and 'issue.author' and 'user.login' take which ever isn't none\n", - " poster_name = event['actor.login'] or event['issue.author'] or event['user.login']\n", - " if event['comment.body'] or event[\"issue.comment\"]:\n", - " is_op = original_poster == poster_name\n", - " thread_html += format_body(event['comment.body'], poster_name, is_op)\n", - "\n", - " thread_html += '
'" - ] - }, - { - "cell_type": "code", - "execution_count": 218, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "4" - ] - }, - "execution_count": 218, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(grouped_events)" - ] - }, - { - "cell_type": "code", - "execution_count": 221, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[{'action': 'created',\n", - " 'actor.id': nan,\n", - " 'actor.login': None,\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': Timestamp('2022-02-01 00:05:19+0000', tz='UTC'),\n", - " 'issue.author': 'kasobol-msft',\n", - " 'issue.comment': \"This won't work well because it includes dependencies in \"\n", - " 'output like this:\\r\\n'\n", - " '![image](https://user-images.githubusercontent.com/61715331/151893024-ef3e99d9-0d83-44c6-839b-966550320642.png)\\r\\n'\n", - " '\\r\\n'\n", - " \"There's hacky way to side step this:\\r\\n\"\n", - " '![image](https://user-images.githubusercontent.com/61715331/151893056-8d018cb9-2f0d-4c7d-8848-eb9df9028b88.png)\\r\\n'\n", - " '\\r\\n'\n", - " 'But it would require be explicit about each dependency in '\n", - " 'each sdk to be precise and not risk any \"dependency doc '\n", - " 'leaks\".',\n", - " 'issue.comment_id': 1026335328.0,\n", - " 'pull_request.merged': None,\n", - " 
'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.state': None,\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'comment',\n", - " 'user.login': None,\n", - " 'user.type': None}],\n", - " [{'action': 'opened',\n", - " 'actor.id': 61715331.0,\n", - " 'actor.login': 'kasobol-msft',\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': Timestamp('2022-01-31 22:51:21+0000', tz='UTC'),\n", - " 'issue.author': None,\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': nan,\n", - " 'pull_request.merged': False,\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.state': 'open',\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'PullRequestEvent',\n", - " 'user.login': None,\n", - " 'user.type': None}],\n", - " [{'action': 'opened',\n", - " 'actor.id': nan,\n", - " 'actor.login': None,\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 
'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': Timestamp('2022-01-31 22:51:20+0000', tz='UTC'),\n", - " 'issue.author': 'kasobol-msft',\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': nan,\n", - " 'pull_request.merged': None,\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.state': None,\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'issue',\n", - " 'user.login': None,\n", - " 'user.type': None}],\n", - " [{'action': 'closed',\n", - " 'actor.id': 61715331.0,\n", - " 'actor.login': 'kasobol-msft',\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': Timestamp('2022-02-01 00:05:20+0000', tz='UTC'),\n", - " 
'issue.author': None,\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': nan,\n", - " 'pull_request.merged': False,\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.state': 'closed',\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'PullRequestEvent',\n", - " 'user.login': None,\n", - " 'user.type': None}]]\n" - ] - } - ], - "source": [ - "pprint(grouped_events)" - ] - }, - { - "cell_type": "code", - "execution_count": 193, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Event TypePullRequestEvent
Userkasobol-msft (type :None)
Actionclosed
Review StateNone
PR Stateclosed, merged: False
From HeadAzure:kasobol-msft-patch-1
\n", - "
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from IPython.display import HTML, display\n", - "display(HTML(thread_html))" - ] - }, - { - "cell_type": "code", - "execution_count": 92, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'pull_request.base.label': 'AbdElrahmanMuhammedNasr:master',\n", - " 'pull_request.base.ref': 'master',\n", - " 'pull_request.base.repo.default_branch': 'master',\n", - " 'pull_request.base.repo.description': None,\n", - " 'pull_request.base.repo.forks_count': 0,\n", - " 'pull_request.base.repo.homepage': None,\n", - " 'pull_request.base.repo.language': 'TypeScript',\n", - " 'pull_request.base.repo.license.name': None,\n", - " 'pull_request.base.repo.name': 'WuzuufMasr',\n", - " 'pull_request.base.repo.open_issues_count': 24,\n", - " 'pull_request.base.repo.owner.login': 'AbdElrahmanMuhammedNasr',\n", - " 'pull_request.base.repo.owner.type': 'User',\n", - " 'pull_request.base.repo.private': False,\n", - " 'pull_request.base.repo.stargazers_count': 0,\n", - " 'pull_request.base.repo.watchers_count': 0,\n", - " 'pull_request.base.sha': 'a7d0127c02152dca69c41f83afb1a0a4d0c0e004',\n", - " 'pull_request.base.user.login': 'AbdElrahmanMuhammedNasr',\n", - " 'pull_request.base.user.type': 'User',\n", - " 'pull_request.comments': 0,\n", - " 'pull_request.label.name': None,\n", - " 'pull_request.review_comments': 0}" - ] - }, - "execution_count": 92, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "merged_ds[0][\"base_repo_info\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 80, - "metadata": {}, - "outputs": [], - "source": [ - "ds = merged_ds" - ] - }, - { - "cell_type": "code", - "execution_count": 321, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'__index_level_0__': 175,\n", - " 'bucket': '940',\n", - " 'pull_request.code_review_events': None,\n", - " 
'pull_request.events': '[{\"type\": \"PullRequestEvent\", \"action\": \"opened\", '\n", - " '\"actor.login\": \"pkarman\", \"actor.id\": 1205061, '\n", - " '\"user.login\": null, \"user.id\": null, \"user.type\": '\n", - " 'null, \"repo.name\": \"18F/C2\", \"repo.id\": 18201810, '\n", - " '\"public\": true, \"created_at\": \"2015-11-23T19:16:36Z\", '\n", - " '\"org.id\": 6233994, \"org.login\": \"18F\", '\n", - " '\"pull_request.id\": 51566831, \"pull_request.number\": '\n", - " '820, \"pull_request.state\": \"open\", '\n", - " '\"pull_request.title\": \"rename elk services to '\n", - " 'workaround blue-green deploy bug\", '\n", - " '\"pull_request.body\": \"there\\'s a bug in the '\n", - " 'cf-blue-green deploy that gets a false positive match '\n", - " 'based on the current ELK naming convention. I have '\n", - " 're-named all our ELK services to workaround that '\n", - " 'bug.\", \"pull_request.user.login\": \"pkarman\", '\n", - " '\"pull_request.user.id\": 1205061, '\n", - " '\"pull_request.author_association\": null, '\n", - " '\"pull_request.created_at\": \"2015-11-23T19:16:34Z\", '\n", - " '\"pull_request.updated_at\": \"2015-11-23T19:16:34Z\", '\n", - " '\"pull_request.closed_at\": null, '\n", - " '\"pull_request.merged_at\": null, '\n", - " '\"pull_request.merge_commit_sha\": '\n", - " '\"4b1557970247cde19eb3ea3992c324174d49a3d7\", '\n", - " '\"pull_request.locked\": false, '\n", - " '\"pull_request.assignee.login\": null, '\n", - " '\"pull_request.assignee.id\": null, '\n", - " '\"pull_request.assignee.type\": null, '\n", - " '\"pull_request.assignee.site_admin\": null, '\n", - " '\"pull_request.milestone.id\": null, '\n", - " '\"pull_request.milestone.number\": null, '\n", - " '\"pull_request.milestone.title\": null, '\n", - " '\"pull_request.milestone.description\": null, '\n", - " '\"pull_request.milestone.creator.login\": null, '\n", - " '\"pull_request.milestone.creator.id\": null, '\n", - " '\"pull_request.milestone.creator.type\": null, '\n", - " 
'\"pull_request.milestone.creator.site_admin\": null, '\n", - " '\"pull_request.milestone.open_issues\": null, '\n", - " '\"pull_request.milestone.closed_issues\": null, '\n", - " '\"pull_request.milestone.state\": null, '\n", - " '\"pull_request.milestone.created_at\": null, '\n", - " '\"pull_request.milestone.updated_at\": null, '\n", - " '\"pull_request.milestone.due_on\": null, '\n", - " '\"pull_request.milestone.closed_at\": null, '\n", - " '\"pull_request.merged\": false, '\n", - " '\"pull_request.mergeable\": true, '\n", - " '\"pull_request.mergeable_state\": \"clean\", '\n", - " '\"pull_request.merged_by.login\": null, '\n", - " '\"pull_request.merged_by.id\": null, '\n", - " '\"pull_request.merged_by.type\": null, '\n", - " '\"pull_request.merged_by.site_admin\": null, '\n", - " '\"pull_request.comments\": 0, '\n", - " '\"pull_request.review_comments\": 0, '\n", - " '\"pull_request.commits\": 1, \"pull_request.additions\": '\n", - " '3, \"pull_request.deletions\": 3, '\n", - " '\"pull_request.changed_files\": 1, '\n", - " '\"pull_request.label.id\": null, '\n", - " '\"pull_request.label.name\": null, '\n", - " '\"pull_request.label.color\": null, '\n", - " '\"pull_request.label.default\": null, '\n", - " '\"pull_request.head.label\": \"18F:elk-rename\", '\n", - " '\"pull_request.head.ref\": \"elk-rename\", '\n", - " '\"pull_request.head.sha\": '\n", - " '\"8a8321be4e8eff669e3d3406393b875bf56684c3\", '\n", - " '\"pull_request.head.user.login\": \"18F\", '\n", - " '\"pull_request.head.user.type\": \"Organization\", '\n", - " '\"pull_request.head.repo.name\": \"C2\", '\n", - " '\"pull_request.head.repo.full_name\": \"18F/C2\", '\n", - " '\"pull_request.head.repo.owner.login\": \"18F\", '\n", - " '\"pull_request.head.repo.owner.type\": \"Organization\", '\n", - " '\"pull_request.head.repo.private\": false, '\n", - " '\"pull_request.head.repo.homepage\": '\n", - " '\"https://cap.18f.gov\", '\n", - " '\"pull_request.head.repo.description\": \"an approval '\n", - 
" 'process automation tool\", '\n", - " '\"pull_request.head.repo.fork\": false, '\n", - " '\"pull_request.head.repo.created_at\": '\n", - " '\"2014-03-28T05:15:23Z\", '\n", - " '\"pull_request.head.repo.updated_at\": '\n", - " '\"2015-11-06T02:16:44Z\", '\n", - " '\"pull_request.head.repo.pushed_at\": '\n", - " '\"2015-11-23T19:16:35Z\", '\n", - " '\"pull_request.head.repo.size\": 81432, '\n", - " '\"pull_request.head.repo.stargazers_count\": 31, '\n", - " '\"pull_request.head.repo.watchers_count\": 31, '\n", - " '\"pull_request.head.repo.language\": \"Ruby\", '\n", - " '\"pull_request.head.repo.has_issues\": true, '\n", - " '\"pull_request.head.repo.has_projects\": null, '\n", - " '\"pull_request.head.repo.has_downloads\": true, '\n", - " '\"pull_request.head.repo.has_wiki\": false, '\n", - " '\"pull_request.head.repo.has_pages\": false, '\n", - " '\"pull_request.head.repo.forks_count\": 16, '\n", - " '\"pull_request.head.repo.archived\": null, '\n", - " '\"pull_request.head.repo.disabled\": null, '\n", - " '\"pull_request.head.repo.open_issues_count\": 6, '\n", - " '\"pull_request.head.repo.forks\": 16, '\n", - " '\"pull_request.head.repo.open_issues\": 6, '\n", - " '\"pull_request.head.repo.watchers\": 31, '\n", - " '\"pull_request.head.repo.default_branch\": \"master\", '\n", - " '\"pull_request.head.repo.license.key\": null, '\n", - " '\"pull_request.head.repo.license.spdx_id\": null, '\n", - " '\"pull_request.head.repo.license.name\": null, '\n", - " '\"pull_request.base.label\": \"18F:master\", '\n", - " '\"pull_request.base.ref\": \"master\", '\n", - " '\"pull_request.base.sha\": '\n", - " '\"5dc2669048311777bf472e824c1a6f865eaccc67\", '\n", - " '\"pull_request.base.user.login\": \"18F\", '\n", - " '\"pull_request.base.user.type\": \"Organization\", '\n", - " '\"pull_request.base.repo.name\": \"C2\", '\n", - " '\"pull_request.base.repo.full_name\": \"18F/C2\", '\n", - " '\"pull_request.base.repo.owner.login\": \"18F\", '\n", - " 
'\"pull_request.base.repo.owner.type\": \"Organization\", '\n", - " '\"pull_request.base.repo.private\": false, '\n", - " '\"pull_request.base.repo.homepage\": '\n", - " '\"https://cap.18f.gov\", '\n", - " '\"pull_request.base.repo.description\": \"an approval '\n", - " 'process automation tool\", '\n", - " '\"pull_request.base.repo.fork\": false, '\n", - " '\"pull_request.base.repo.created_at\": '\n", - " '\"2014-03-28T05:15:23Z\", '\n", - " '\"pull_request.base.repo.updated_at\": '\n", - " '\"2015-11-06T02:16:44Z\", '\n", - " '\"pull_request.base.repo.pushed_at\": '\n", - " '\"2015-11-23T19:16:35Z\", '\n", - " '\"pull_request.base.repo.size\": 81432, '\n", - " '\"pull_request.base.repo.stargazers_count\": 31, '\n", - " '\"pull_request.base.repo.watchers_count\": 31, '\n", - " '\"pull_request.base.repo.language\": \"Ruby\", '\n", - " '\"pull_request.base.repo.has_issues\": true, '\n", - " '\"pull_request.base.repo.has_projects\": null, '\n", - " '\"pull_request.base.repo.has_downloads\": true, '\n", - " '\"pull_request.base.repo.has_wiki\": false, '\n", - " '\"pull_request.base.repo.has_pages\": false, '\n", - " '\"pull_request.base.repo.forks_count\": 16, '\n", - " '\"pull_request.base.repo.archived\": null, '\n", - " '\"pull_request.base.repo.disabled\": null, '\n", - " '\"pull_request.base.repo.open_issues_count\": 6, '\n", - " '\"pull_request.base.repo.forks\": 16, '\n", - " '\"pull_request.base.repo.open_issues\": 6, '\n", - " '\"pull_request.base.repo.watchers\": 31, '\n", - " '\"pull_request.base.repo.default_branch\": \"master\", '\n", - " '\"pull_request.base.repo.license.key\": null, '\n", - " '\"pull_request.base.repo.license.spdx_id\": null, '\n", - " '\"pull_request.base.repo.license.name\": null, '\n", - " '\"pull_request.guid\": \"18F/C2/pull/820\"}, {\"type\": '\n", - " '\"PullRequestEvent\", \"action\": \"closed\", '\n", - " '\"actor.login\": \"jessieay\", \"actor.id\": 601515, '\n", - " '\"user.login\": null, \"user.id\": null, \"user.type\": 
'\n", - " 'null, \"repo.name\": \"18F/C2\", \"repo.id\": 18201810, '\n", - " '\"public\": true, \"created_at\": \"2015-11-23T22:09:46Z\", '\n", - " '\"org.id\": 6233994, \"org.login\": \"18F\", '\n", - " '\"pull_request.id\": 51566831, \"pull_request.number\": '\n", - " '820, \"pull_request.state\": \"closed\", '\n", - " '\"pull_request.title\": \"rename elk services to '\n", - " 'workaround blue-green deploy bug\", '\n", - " '\"pull_request.body\": \"there\\'s a bug in the '\n", - " 'cf-blue-green deploy that gets a false positive match '\n", - " 'based on the current ELK naming convention. I have '\n", - " 're-named all our ELK services to workaround that '\n", - " 'bug.\", \"pull_request.user.login\": \"pkarman\", '\n", - " '\"pull_request.user.id\": 1205061, '\n", - " '\"pull_request.author_association\": null, '\n", - " '\"pull_request.created_at\": \"2015-11-23T19:16:34Z\", '\n", - " '\"pull_request.updated_at\": \"2015-11-23T22:09:45Z\", '\n", - " '\"pull_request.closed_at\": \"2015-11-23T22:09:45Z\", '\n", - " '\"pull_request.merged_at\": \"2015-11-23T22:09:45Z\", '\n", - " '\"pull_request.merge_commit_sha\": '\n", - " '\"6d3c30d429a49321552973b81e1ef4cd3073157f\", '\n", - " '\"pull_request.locked\": false, '\n", - " '\"pull_request.assignee.login\": null, '\n", - " '\"pull_request.assignee.id\": null, '\n", - " '\"pull_request.assignee.type\": null, '\n", - " '\"pull_request.assignee.site_admin\": null, '\n", - " '\"pull_request.milestone.id\": null, '\n", - " '\"pull_request.milestone.number\": null, '\n", - " '\"pull_request.milestone.title\": null, '\n", - " '\"pull_request.milestone.description\": null, '\n", - " '\"pull_request.milestone.creator.login\": null, '\n", - " '\"pull_request.milestone.creator.id\": null, '\n", - " '\"pull_request.milestone.creator.type\": null, '\n", - " '\"pull_request.milestone.creator.site_admin\": null, '\n", - " '\"pull_request.milestone.open_issues\": null, '\n", - " '\"pull_request.milestone.closed_issues\": null, 
'\n", - " '\"pull_request.milestone.state\": null, '\n", - " '\"pull_request.milestone.created_at\": null, '\n", - " '\"pull_request.milestone.updated_at\": null, '\n", - " '\"pull_request.milestone.due_on\": null, '\n", - " '\"pull_request.milestone.closed_at\": null, '\n", - " '\"pull_request.merged\": true, '\n", - " '\"pull_request.mergeable\": null, '\n", - " '\"pull_request.mergeable_state\": \"unknown\", '\n", - " '\"pull_request.merged_by.login\": \"jessieay\", '\n", - " '\"pull_request.merged_by.id\": 601515, '\n", - " '\"pull_request.merged_by.type\": \"User\", '\n", - " '\"pull_request.merged_by.site_admin\": false, '\n", - " '\"pull_request.comments\": 1, '\n", - " '\"pull_request.review_comments\": 0, '\n", - " '\"pull_request.commits\": 1, \"pull_request.additions\": '\n", - " '3, \"pull_request.deletions\": 3, '\n", - " '\"pull_request.changed_files\": 1, '\n", - " '\"pull_request.label.id\": null, '\n", - " '\"pull_request.label.name\": null, '\n", - " '\"pull_request.label.color\": null, '\n", - " '\"pull_request.label.default\": null, '\n", - " '\"pull_request.head.label\": \"18F:elk-rename\", '\n", - " '\"pull_request.head.ref\": \"elk-rename\", '\n", - " '\"pull_request.head.sha\": '\n", - " '\"8a8321be4e8eff669e3d3406393b875bf56684c3\", '\n", - " '\"pull_request.head.user.login\": \"18F\", '\n", - " '\"pull_request.head.user.type\": \"Organization\", '\n", - " '\"pull_request.head.repo.name\": \"C2\", '\n", - " '\"pull_request.head.repo.full_name\": \"18F/C2\", '\n", - " '\"pull_request.head.repo.owner.login\": \"18F\", '\n", - " '\"pull_request.head.repo.owner.type\": \"Organization\", '\n", - " '\"pull_request.head.repo.private\": false, '\n", - " '\"pull_request.head.repo.homepage\": '\n", - " '\"https://cap.18f.gov\", '\n", - " '\"pull_request.head.repo.description\": \"an approval '\n", - " 'process automation tool\", '\n", - " '\"pull_request.head.repo.fork\": false, '\n", - " '\"pull_request.head.repo.created_at\": '\n", - " 
'\"2014-03-28T05:15:23Z\", '\n", - " '\"pull_request.head.repo.updated_at\": '\n", - " '\"2015-11-06T02:16:44Z\", '\n", - " '\"pull_request.head.repo.pushed_at\": '\n", - " '\"2015-11-23T22:09:45Z\", '\n", - " '\"pull_request.head.repo.size\": 81440, '\n", - " '\"pull_request.head.repo.stargazers_count\": 31, '\n", - " '\"pull_request.head.repo.watchers_count\": 31, '\n", - " '\"pull_request.head.repo.language\": \"Ruby\", '\n", - " '\"pull_request.head.repo.has_issues\": true, '\n", - " '\"pull_request.head.repo.has_projects\": null, '\n", - " '\"pull_request.head.repo.has_downloads\": true, '\n", - " '\"pull_request.head.repo.has_wiki\": false, '\n", - " '\"pull_request.head.repo.has_pages\": false, '\n", - " '\"pull_request.head.repo.forks_count\": 16, '\n", - " '\"pull_request.head.repo.archived\": null, '\n", - " '\"pull_request.head.repo.disabled\": null, '\n", - " '\"pull_request.head.repo.open_issues_count\": 4, '\n", - " '\"pull_request.head.repo.forks\": 16, '\n", - " '\"pull_request.head.repo.open_issues\": 4, '\n", - " '\"pull_request.head.repo.watchers\": 31, '\n", - " '\"pull_request.head.repo.default_branch\": \"master\", '\n", - " '\"pull_request.head.repo.license.key\": null, '\n", - " '\"pull_request.head.repo.license.spdx_id\": null, '\n", - " '\"pull_request.head.repo.license.name\": null, '\n", - " '\"pull_request.base.label\": \"18F:master\", '\n", - " '\"pull_request.base.ref\": \"master\", '\n", - " '\"pull_request.base.sha\": '\n", - " '\"5dc2669048311777bf472e824c1a6f865eaccc67\", '\n", - " '\"pull_request.base.user.login\": \"18F\", '\n", - " '\"pull_request.base.user.type\": \"Organization\", '\n", - " '\"pull_request.base.repo.name\": \"C2\", '\n", - " '\"pull_request.base.repo.full_name\": \"18F/C2\", '\n", - " '\"pull_request.base.repo.owner.login\": \"18F\", '\n", - " '\"pull_request.base.repo.owner.type\": \"Organization\", '\n", - " '\"pull_request.base.repo.private\": false, '\n", - " '\"pull_request.base.repo.homepage\": '\n", - 
" '\"https://cap.18f.gov\", '\n", - " '\"pull_request.base.repo.description\": \"an approval '\n", - " 'process automation tool\", '\n", - " '\"pull_request.base.repo.fork\": false, '\n", - " '\"pull_request.base.repo.created_at\": '\n", - " '\"2014-03-28T05:15:23Z\", '\n", - " '\"pull_request.base.repo.updated_at\": '\n", - " '\"2015-11-06T02:16:44Z\", '\n", - " '\"pull_request.base.repo.pushed_at\": '\n", - " '\"2015-11-23T22:09:45Z\", '\n", - " '\"pull_request.base.repo.size\": 81440, '\n", - " '\"pull_request.base.repo.stargazers_count\": 31, '\n", - " '\"pull_request.base.repo.watchers_count\": 31, '\n", - " '\"pull_request.base.repo.language\": \"Ruby\", '\n", - " '\"pull_request.base.repo.has_issues\": true, '\n", - " '\"pull_request.base.repo.has_projects\": null, '\n", - " '\"pull_request.base.repo.has_downloads\": true, '\n", - " '\"pull_request.base.repo.has_wiki\": false, '\n", - " '\"pull_request.base.repo.has_pages\": false, '\n", - " '\"pull_request.base.repo.forks_count\": 16, '\n", - " '\"pull_request.base.repo.archived\": null, '\n", - " '\"pull_request.base.repo.disabled\": null, '\n", - " '\"pull_request.base.repo.open_issues_count\": 4, '\n", - " '\"pull_request.base.repo.forks\": 16, '\n", - " '\"pull_request.base.repo.open_issues\": 4, '\n", - " '\"pull_request.base.repo.watchers\": 31, '\n", - " '\"pull_request.base.repo.default_branch\": \"master\", '\n", - " '\"pull_request.base.repo.license.key\": null, '\n", - " '\"pull_request.base.repo.license.spdx_id\": null, '\n", - " '\"pull_request.base.repo.license.name\": null, '\n", - " '\"pull_request.guid\": \"18F/C2/pull/820\"}]',\n", - " 'pull_request.guid': '18F/C2/pull/820',\n", - " 'pull_request.issue_events': '{\"repo\": \"18F/C2\", \"org\": \"18F\", \"issue_id\": '\n", - " '118451607, \"issue_number\": 820, \"pull_request\": '\n", - " '{\"number\": 820.0, \"repo\": \"C2\", \"user_login\": '\n", - " '\"18F\"}, \"events\": [{\"action\": \"opened\", '\n", - " '\"author\": \"pkarman\", 
\"comment\": null, '\n", - " '\"comment_id\": null, \"datetime\": '\n", - " '\"2015-11-23T19:16:34Z\", \"description\": '\n", - " '\"there\\'s a bug in the cf-blue-green deploy '\n", - " 'that gets a false positive match based on the '\n", - " 'current ELK naming convention. I have re-named '\n", - " 'all our ELK services to workaround that bug.\", '\n", - " '\"title\": \"rename elk services to workaround '\n", - " 'blue-green deploy bug\", \"type\": \"issue\"}, '\n", - " '{\"action\": \"created\", \"author\": \"jessieay\", '\n", - " '\"comment\": \"wish there were a good way to write '\n", - " 'tests for this type of thing...\\\\r\\\\n\\\\r\\\\nbut '\n", - " 'LGTM. merging. \", \"comment_id\": 159082113.0, '\n", - " '\"datetime\": \"2015-11-23 22:09:43+00:00\", '\n", - " '\"description\": null, \"title\": null, \"type\": '\n", - " '\"comment\"}]}'}\n" - ] - } - ], - "source": [ - "pprint(small_ds[8])" - ] - }, - { - "cell_type": "code", - "execution_count": 327, - "metadata": {}, - "outputs": [], - "source": [ - "actions = []\n", - "c = 0\n", - "for events in ds[\"events\"]:\n", - " c += 1\n", - " actions.extend([event[\"action\"] for event in events])\n", - " if c > 10000:\n", - " break\n" - ] - }, - { - "cell_type": "code", - "execution_count": 328, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'closed', 'created', 'opened', 'reopened'}" - ] - }, - "execution_count": 328, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "set(actions)" - ] - }, - { - "cell_type": "code", - "execution_count": 322, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'base_repo_info': {'pull_request.base.label': '1011X:master',\n", - " 'pull_request.base.ref': 'master',\n", - " 'pull_request.base.repo.default_branch': 'master',\n", - " 'pull_request.base.repo.description': 'Representing '\n", - " 'rational numbers '\n", - " 'using the '\n", - " 'floating-bar number '\n", - " 
'type.',\n", - " 'pull_request.base.repo.forks_count': 2,\n", - " 'pull_request.base.repo.homepage': None,\n", - " 'pull_request.base.repo.language': 'Rust',\n", - " 'pull_request.base.repo.license.name': 'Other',\n", - " 'pull_request.base.repo.name': 'floating_bar',\n", - " 'pull_request.base.repo.open_issues_count': 6,\n", - " 'pull_request.base.repo.owner.login': '1011X',\n", - " 'pull_request.base.repo.owner.type': 'User',\n", - " 'pull_request.base.repo.private': False,\n", - " 'pull_request.base.repo.stargazers_count': 15,\n", - " 'pull_request.base.repo.watchers_count': 15,\n", - " 'pull_request.base.sha': '27ee250ef208e11aa36dc77022b0f8a58e965dba',\n", - " 'pull_request.base.user.login': '1011X',\n", - " 'pull_request.base.user.type': 'User',\n", - " 'pull_request.comments': 0,\n", - " 'pull_request.label.name': None,\n", - " 'pull_request.review_comments': 0},\n", - " 'bucket': '940',\n", - " 'events': [{'action': 'opened',\n", - " 'actor.id': None,\n", - " 'actor.login': None,\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': datetime.datetime(2021, 5, 8, 20, 30, 31, tzinfo=),\n", - " 'issue.author': 'ZoeyR',\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': None,\n", - " 'pull_request.merged': None,\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.state': None,\n", - " 
'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'issue',\n", - " 'user.login': None,\n", - " 'user.type': None},\n", - " {'action': 'opened',\n", - " 'actor.id': 8010244,\n", - " 'actor.login': 'ZoeyR',\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': datetime.datetime(2021, 5, 8, 20, 30, 32, tzinfo=),\n", - " 'issue.author': None,\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': None,\n", - " 'pull_request.merged': False,\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.state': 'open',\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'PullRequestEvent',\n", - " 'user.login': None,\n", - " 'user.type': None},\n", - " {'action': 'created',\n", - " 'actor.id': None,\n", - " 'actor.login': None,\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': 
None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': datetime.datetime(2021, 5, 8, 20, 38, 27, tzinfo=),\n", - " 'issue.author': '1011X',\n", - " 'issue.comment': 'LGTM, thank you!',\n", - " 'issue.comment_id': 835503633.0,\n", - " 'pull_request.merged': None,\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.state': None,\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'comment',\n", - " 'user.login': None,\n", - " 'user.type': None},\n", - " {'action': 'closed',\n", - " 'actor.id': 1851619,\n", - " 'actor.login': '1011X',\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': datetime.datetime(2021, 5, 8, 20, 38, 38, tzinfo=),\n", - " 'issue.author': None,\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': None,\n", - " 'pull_request.merged': True,\n", - " 
'pull_request.merged_by.login': '1011X',\n", - " 'pull_request.merged_by.type': 'User',\n", - " 'pull_request.state': 'closed',\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'PullRequestEvent',\n", - " 'user.login': None,\n", - " 'user.type': None}],\n", - " 'head_repo_info': {'pull_request.head.label': 'ZoeyR:fractional-benches',\n", - " 'pull_request.head.ref': 'fractional-benches',\n", - " 'pull_request.head.repo.default_branch': 'master',\n", - " 'pull_request.head.repo.description': 'Representing '\n", - " 'rational numbers '\n", - " 'using the '\n", - " 'floating-bar number '\n", - " 'type.',\n", - " 'pull_request.head.repo.homepage': None,\n", - " 'pull_request.head.repo.language': None,\n", - " 'pull_request.head.repo.license.name': 'Other',\n", - " 'pull_request.head.repo.name': 'floating_bar',\n", - " 'pull_request.head.repo.owner.login': 'ZoeyR',\n", - " 'pull_request.head.repo.owner.type': 'User',\n", - " 'pull_request.head.repo.private': False,\n", - " 'pull_request.head.repo.stargazers_count': 0,\n", - " 'pull_request.head.sha': '742df616b7ea2cb927d5247ec69b91e6c6d8cbdd',\n", - " 'pull_request.head.user.login': 'ZoeyR',\n", - " 'pull_request.head.user.type': 'User'},\n", - " 'pull_request_info': {'org.id': None,\n", - " 'public': True,\n", - " 'pull_request.additions': 23,\n", - " 'pull_request.base.user.type': 'User',\n", - " 'pull_request.body': '',\n", - " 'pull_request.changed_files': 4,\n", - " 'pull_request.closed_at': None,\n", - " 'pull_request.comments': 0,\n", - " 'pull_request.commits': 1,\n", - " 'pull_request.created_at': '2021-05-08T20:30:31Z',\n", - " 'pull_request.deletions': 19,\n", - " 'pull_request.guid': '1011X/floating_bar/pull/7',\n", - " 'pull_request.head.user.type': 'User',\n", - " 'pull_request.id': 634875503,\n", - " 'pull_request.merged_at': None,\n", - " 
'pull_request.merged_by.login': None,\n", - " 'pull_request.milestone.description': None,\n", - " 'pull_request.milestone.number': None,\n", - " 'pull_request.milestone.title': None,\n", - " 'pull_request.number': 7,\n", - " 'pull_request.review_comments': 0,\n", - " 'pull_request.state': 'open',\n", - " 'pull_request.title': 'change benches to use fractional '\n", - " 'values',\n", - " 'pull_request.user.id': 8010244,\n", - " 'pull_request.user.login': 'ZoeyR',\n", - " 'repo.id': 166723951,\n", - " 'repo.name': '1011X/floating_bar'}}\n" - ] - } - ], - "source": [ - "pprint(ds[6])" - ] - }, - { - "cell_type": "code", - "execution_count": 318, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'bucket': '940',\n", - " 'pull_request_info': {'org.id': None,\n", - " 'public': True,\n", - " 'pull_request.additions': 23,\n", - " 'pull_request.base.user.type': 'User',\n", - " 'pull_request.body': '',\n", - " 'pull_request.changed_files': 4,\n", - " 'pull_request.closed_at': None,\n", - " 'pull_request.comments': 0,\n", - " 'pull_request.commits': 1,\n", - " 'pull_request.created_at': '2021-05-08T20:30:31Z',\n", - " 'pull_request.deletions': 19,\n", - " 'pull_request.guid': '1011X/floating_bar/pull/7',\n", - " 'pull_request.head.user.type': 'User',\n", - " 'pull_request.id': 634875503,\n", - " 'pull_request.merged_at': None,\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.milestone.description': None,\n", - " 'pull_request.milestone.number': None,\n", - " 'pull_request.milestone.title': None,\n", - " 'pull_request.number': 7,\n", - " 'pull_request.review_comments': 0,\n", - " 'pull_request.state': 'open',\n", - " 'pull_request.title': 'change benches to use fractional values',\n", - " 'pull_request.user.id': 8010244,\n", - " 'pull_request.user.login': 'ZoeyR',\n", - " 'repo.id': 166723951,\n", - " 'repo.name': '1011X/floating_bar'},\n", - " 'head_repo_info': {'pull_request.head.label': 'ZoeyR:fractional-benches',\n", - " 
'pull_request.head.ref': 'fractional-benches',\n", - " 'pull_request.head.repo.default_branch': 'master',\n", - " 'pull_request.head.repo.description': 'Representing rational numbers using the floating-bar number type.',\n", - " 'pull_request.head.repo.homepage': None,\n", - " 'pull_request.head.repo.language': None,\n", - " 'pull_request.head.repo.license.name': 'Other',\n", - " 'pull_request.head.repo.name': 'floating_bar',\n", - " 'pull_request.head.repo.owner.login': 'ZoeyR',\n", - " 'pull_request.head.repo.owner.type': 'User',\n", - " 'pull_request.head.repo.private': False,\n", - " 'pull_request.head.repo.stargazers_count': 0,\n", - " 'pull_request.head.sha': '742df616b7ea2cb927d5247ec69b91e6c6d8cbdd',\n", - " 'pull_request.head.user.login': 'ZoeyR',\n", - " 'pull_request.head.user.type': 'User'},\n", - " 'base_repo_info': {'pull_request.base.label': '1011X:master',\n", - " 'pull_request.base.ref': 'master',\n", - " 'pull_request.base.repo.default_branch': 'master',\n", - " 'pull_request.base.repo.description': 'Representing rational numbers using the floating-bar number type.',\n", - " 'pull_request.base.repo.forks_count': 2,\n", - " 'pull_request.base.repo.homepage': None,\n", - " 'pull_request.base.repo.language': 'Rust',\n", - " 'pull_request.base.repo.license.name': 'Other',\n", - " 'pull_request.base.repo.name': 'floating_bar',\n", - " 'pull_request.base.repo.open_issues_count': 6,\n", - " 'pull_request.base.repo.owner.login': '1011X',\n", - " 'pull_request.base.repo.owner.type': 'User',\n", - " 'pull_request.base.repo.private': False,\n", - " 'pull_request.base.repo.stargazers_count': 15,\n", - " 'pull_request.base.repo.watchers_count': 15,\n", - " 'pull_request.base.sha': '27ee250ef208e11aa36dc77022b0f8a58e965dba',\n", - " 'pull_request.base.user.login': '1011X',\n", - " 'pull_request.base.user.type': 'User',\n", - " 'pull_request.comments': 0,\n", - " 'pull_request.label.name': None,\n", - " 'pull_request.review_comments': 0},\n", - " 'events': 
[{'action': 'opened',\n", - " 'actor.id': None,\n", - " 'actor.login': None,\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': datetime.datetime(2021, 5, 8, 20, 30, 31, tzinfo=),\n", - " 'issue.author': 'ZoeyR',\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': None,\n", - " 'pull_request.merged': None,\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.state': None,\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'issue',\n", - " 'user.login': None,\n", - " 'user.type': None},\n", - " {'action': 'opened',\n", - " 'actor.id': 8010244,\n", - " 'actor.login': 'ZoeyR',\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 
'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': datetime.datetime(2021, 5, 8, 20, 30, 32, tzinfo=),\n", - " 'issue.author': None,\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': None,\n", - " 'pull_request.merged': False,\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.state': 'open',\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'PullRequestEvent',\n", - " 'user.login': None,\n", - " 'user.type': None},\n", - " {'action': 'created',\n", - " 'actor.id': None,\n", - " 'actor.login': None,\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': datetime.datetime(2021, 5, 8, 20, 38, 27, tzinfo=),\n", - " 'issue.author': '1011X',\n", - " 'issue.comment': 'LGTM, thank you!',\n", - " 'issue.comment_id': 835503633.0,\n", - " 'pull_request.merged': None,\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.state': None,\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 
'review.submitted_at': None,\n", - " 'type': 'comment',\n", - " 'user.login': None,\n", - " 'user.type': None},\n", - " {'action': 'closed',\n", - " 'actor.id': 1851619,\n", - " 'actor.login': '1011X',\n", - " 'comment.author_association': None,\n", - " 'comment.body': None,\n", - " 'comment.commit_id': None,\n", - " 'comment.created_at': None,\n", - " 'comment.diff_hunk': None,\n", - " 'comment.id': None,\n", - " 'comment.in_reply_to_id': None,\n", - " 'comment.line': None,\n", - " 'comment.original_commit_id': None,\n", - " 'comment.original_line': None,\n", - " 'comment.original_position': None,\n", - " 'comment.original_start_line': None,\n", - " 'comment.path': None,\n", - " 'comment.position': None,\n", - " 'comment.side': None,\n", - " 'comment.start_line': None,\n", - " 'comment.start_side': None,\n", - " 'comment.updated_at': None,\n", - " 'created_at': datetime.datetime(2021, 5, 8, 20, 38, 38, tzinfo=),\n", - " 'issue.author': None,\n", - " 'issue.comment': None,\n", - " 'issue.comment_id': None,\n", - " 'pull_request.merged': True,\n", - " 'pull_request.merged_by.login': '1011X',\n", - " 'pull_request.merged_by.type': 'User',\n", - " 'pull_request.state': 'closed',\n", - " 'review.author_association': None,\n", - " 'review.body': None,\n", - " 'review.commit_id': None,\n", - " 'review.id': None,\n", - " 'review.state': None,\n", - " 'review.submitted_at': None,\n", - " 'type': 'PullRequestEvent',\n", - " 'user.login': None,\n", - " 'user.type': None}]}" - ] - }, - "execution_count": 318, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sample = ds[6]\n", - "sample" - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "metadata": {}, - "outputs": [], - "source": [ - "sample = ds[0]\n", - "pr_info = sample[\"pull_request_info\"]\n", - "head_info = sample[\"head_repo_info\"]\n", - "base_info = sample[\"base_repo_info\"]\n", - "events = sample[\"events\"]\n", - "\n", - "gh_link = 
f\"https://github.com/{pr_info['repo.name']}/pull/{pr_info['pull_request.number']}\"\n", - "\n", - "header = f\"\"\"📝 **Title**: {pr_info['pull_request.title']}
\n", - "📦 **GitHub Repo**: {pr_info['repo.name']}, PR Number: {pr_info['pull_request.number']}, ID: {pr_info['pull_request.id']}.
\n", - "Link: [{gh_link}]({gh_link})\"\"\"\n", - "pr_info_html = f\"\"\"\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
AttributeDetail
🧾 PR Type{events[0]['type']}
🟢 PR State{pr_info['pull_request.state']}
👤 PR Author{pr_info['pull_request.user.login']}
🏷️ Head Branchref: {head_info['pull_request.head.ref']}, label: {head_info['pull_request.head.label']}
🌳 Base Branch{base_info['pull_request.base.ref']}
\n", - "\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 82, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
AttributeDetail
🧾 PR TypePullRequestEvent
🟢 PR Stateopen
👤 PR Authordependabot[bot]
🏷️ Head Branchref: dependabot/npm_and_yarn/qs-6.5.3, label: AbdElrahmanMuhammedNasr:dependabot/npm_and_yarn/qs-6.5.3
🌳 Base Branchmaster
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# display pr_info_html as HTML\n", - "from IPython.display import HTML, display\n", - "display(HTML(pr_info_html))" - ] - }, - { - "cell_type": "code", - "execution_count": 75, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
actioncommentscreated_attype
0opened{'actor.id': 49699333, 'actor.login': 'dependa...2022-12-10 03:27:08+00:00PullRequestEvent
\n", - "
" - ], - "text/plain": [ - " action comments \n", - "0 opened {'actor.id': 49699333, 'actor.login': 'dependa... \\\n", - "\n", - " created_at type \n", - "0 2022-12-10 03:27:08+00:00 PullRequestEvent " - ] - }, - "execution_count": 75, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = pd.DataFrame(events)\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 84, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Event TypePullRequestEvent
UserNone (type :None)
Review StateNone
From HeadAbdElrahmanMuhammedNasr:dependabot/npm_and_yarn/qs-6.5.3
\n", - "
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "def create_grouped_events(events):\n", - " df = pd.DataFrame(events)\n", - " df['created_at'] = pd.to_datetime(df['created_at'])\n", - " df = df.sort_values(['comment.diff_hunk', 'comment.commit_id', 'created_at'])\n", - " # Group events in a the same thread using 'comment.diff_hunk' and 'comment.commit_id'\n", - " if len(df) == 1:\n", - " grouped_events = [[df.iloc[0].to_dict()]]\n", - " else:\n", - " grouped_events = [group.to_dict(orient='records') for _, group in df.groupby(['comment.diff_hunk', 'comment.commit_id'])]\n", - " return grouped_events\n", - " \n", - "events = sample[\"events\"]\n", - "grouped_events = create_grouped_events(events)\n", - "original_poster = sample[\"pull_request_info\"]['pull_request.user.login']\n", - "for thread in grouped_events:\n", - " # Start a new thread\n", - " thread_html = '
'\n", - " # Get the first event in the thread as a reference\n", - " first_event = thread[0]\n", - " \n", - " # Add shared parts of the events only once\n", - " text = f\"\"\"\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Event Type{first_event['type']}
User{first_event['user.login']} (type :{first_event['user.type']})
Review State{first_event['review.state']}
From Head{head_info['pull_request.head.label']}
\n", - "
\n", - " \"\"\"\n", - " thread_html += text\n", - " \n", - " # Add the bodies of the comments for each event in the thread\n", - " for event in thread:\n", - " if event['comment.body']:\n", - " is_op = original_poster == event['user.login']\n", - " thread_html += format_body(event['comment.body'], event['user.login'], is_op)\n", - " thread_html += '
'\n", - " display(HTML(thread_html))\n", - " if first_event['comment.path']:\n", - " path_html = f\"Path: {first_event['comment.path']}\"\n", - " display(HTML(path_html))\n", - " if first_event[\"comment.diff_hunk\"]:\n", - " print(first_event[\"comment.diff_hunk\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'dependabot[bot]'" - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sample[\"pull_request_info\"]['pull_request.user.login']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import pandas as pd\n", - "import ghdiff\n", - "import streamlit as st\n", - "import streamlit.components.v1 as components\n", - "from datasets import load_dataset\n", - "\n", - "\n", - "# save dataset as in \"bigcode/code_reviews_sample\"\n", - "ds = load_dataset(\"loubnabnl/clean_prs2\", split=\"train\")\n", - "size = len(ds)\n", - "\n", - "def show_diff_hunk(diff_hunk, position, context=5):\n", - " # exclude the first line with the @@ notation\n", - " lines = diff_hunk.split('\\n')\n", - " start_line = max(int(position) - context - 1, 0)\n", - " end_line = int(position)\n", - " actual_diff = lines[0] + '\\n' + '\\n'.join(lines[start_line + 1:end_line + 1])\n", - " focus = ghdiff.colorize(actual_diff)\n", - " full = ghdiff.colorize(diff_hunk)\n", - " # Wrap the diff hunk inside a scrollable div\n", - " scrollable_focus = f'
{focus}
'\n", - " scrollable_full = f'
{full}
'\n", - " if len(lines) <= 12:\n", - " return None, scrollable_full\n", - " return scrollable_focus, scrollable_full\n", - "\n", - "\n", - "def format_body(text, user, is_op=False):\n", - " color = \"#007bff\" if is_op else \"black\"\n", - " pr_body = f\"
👤{user}: {text}
\"\n", - " return pr_body\n", - "\n", - "\n", - "def create_grouped_events(events):\n", - " df = pd.DataFrame(events)\n", - " df['created_at'] = pd.to_datetime(df['created_at'])\n", - " df = df.sort_values(['comment.diff_hunk', 'comment.commit_id', 'created_at'])\n", - " # Group events in a the same thread using 'comment.diff_hunk' and 'comment.commit_id'\n", - " if len(df) == 1:\n", - " grouped_events = [[df.iloc[0].to_dict()]]\n", - " else:\n", - " grouped_events = [group.to_dict(orient='records') for _, group in df.groupby(['comment.diff_hunk', 'comment.commit_id'])]\n", - " return grouped_events\n", - "\n", - "\n", - "def get_pr_info(sample):\n", - " pr_info = sample[\"pull_request_info\"]\n", - " head_info = sample[\"head_repo_info\"]\n", - " base_info = sample[\"base_repo_info\"]\n", - " events = sample[\"events\"]\n", - "\n", - " gh_link = f\"https://github.com/{pr_info['repo.name']}/pull/{pr_info['pull_request.number']}\"\n", - " \n", - " header = f\"\"\"📝 **Title**: {pr_info['pull_request.title']}
\n", - " 📦 **GitHub Repo**: {pr_info['repo.name']}, PR Number: {pr_info['pull_request.number']}, ID: {pr_info['pull_request.id']}.
\n", - " Link: [{gh_link}]({gh_link})\"\"\"\n", - " pr_info_html = f\"\"\"\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
AttributeDetail
🧾 PR Type{events[0]['type']}
🟢 PR State{pr_info['pull_request.state']}
👤 PR Author{pr_info['pull_request.user.login']}
🏷️ Head Branchref: {head_info['pull_request.head.ref']}, label: {head_info['pull_request.head.label']}
🌳 Base Branch{base_info['pull_request.base.ref']}
\n", - " \"\"\"\n", - " return header, pr_info_html\n", - "\n", - "\n", - "def display_events(sample):\n", - " events = sample[\"events\"]\n", - " grouped_events = create_grouped_events(events)\n", - " original_poster = sample[\"pull_request_info\"]['pull_request.user.login']\n", - " for thread in grouped_events:\n", - " # Start a new thread\n", - " thread_html = '
'\n", - " # Get the first event in the thread as a reference\n", - " first_event = thread[0]\n", - " \n", - " # Add shared parts of the events only once\n", - " text = f\"\"\"\n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Event Type{first_event['type']}
User{first_event['user.login']} (type :{first_event['user.type']})
Review State{first_event['review.state']}
From Head{first_event['pull_request.head.label']}
\n", - "
\n", - " \"\"\"\n", - " thread_html += text\n", - " \n", - " # Add the bodies of the comments for each event in the thread\n", - " for event in thread:\n", - " if event['comment.body']:\n", - " is_op = original_poster == event['user.login']\n", - " thread_html += format_body(event['comment.body'], event['user.login'], is_op)\n", - " thread_html += '
'\n", - " st.markdown(thread_html, unsafe_allow_html=True)\n", - " if first_event['comment.path']:\n", - " path_html = f\"Path: {first_event['comment.path']}\"\n", - " st.markdown(path_html, unsafe_allow_html=True)\n", - " if first_event[\"comment.diff_hunk\"]:\n", - " focus_diff, full_diff = show_diff_hunk(first_event[\"comment.diff_hunk\"], first_event[\"comment.original_position\"])\n", - " if not focus_diff:\n", - " components.html(full_diff)\n", - " else:\n", - " components.html(focus_diff)\n", - " with st.expander(\"View Full diff hunk\"):\n", - " components.html(full_diff)\n", - " st.markdown(\"---\")\n", - "\n", - "def custom_css():\n", - " st.markdown(\"\"\"\n", - " \n", - " \"\"\", unsafe_allow_html=True)\n", - "\n", - "custom_css()\n", - "\n", - "\n", - "#st.set_page_config(page_icon=\":laptop:\", layout=\"wide\")\n", - "st.markdown(f\"\"\"\\\n", - " # GitHub Code Reviews Inspection 🔍\n", - " In this space you can inspect code reviews from GitHUb Pull Requests. Note that some may have empty text (e.g approval of a PR without a code comment).\n", - " You can find the dataset at [bigcode/code_reviews_sample](https://huggingface.co/datasets/bigcode/code_reviews_sample)\n", - " \"\"\"\n", - " )\n", - "example_index = st.number_input(f\"Example (0 to {size-1}):\", min_value=0, max_value=size-1, value=0, step=1)\n", - "\n", - "header, pr_info_html = get_pr_info(ds[example_index])\n", - "st.subheader(\"PR information\")\n", - "st.markdown(header, unsafe_allow_html=True)\n", - "st.markdown(pr_info_html, unsafe_allow_html=True)\n", - "st.markdown(\"
\", unsafe_allow_html=True)\n", - "st.subheader(\"Code review events\")\n", - "event_blocks = display_events(ds[example_index])\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ValueError: The features can't be aligned because the key pull_request_info of features {'pull_request.guid': Value(dtype='string', id=None), 'pull_request.code_review_events': Value(dtype='string', id=None), 'pull_request.events': Value(dtype='string', id=None), 'pull_request.issue_events': Value(dtype='string', id=None), 'bucket': Value(dtype='string', id=None), '__index_level_0__': Value(dtype='int64', id=None), 'pull_request_info': {'org.id': Value(dtype='int64', id=None), 'public': Value(dtype='bool', id=None), 'pull_request.additions': Value(dtype='int64', id=None), 'pull_request.body': Value(dtype='string', id=None), 'pull_request.changed_files': Value(dtype='int64', id=None), 'pull_request.closed_at': Value(dtype='null', id=None), 'pull_request.comments': Value(dtype='int64', id=None), 'pull_request.commits': Value(dtype='int64', id=None), 'pull_request.created_at': Value(dtype='string', id=None), 'pull_request.deletions': Value(dtype='int64', id=None), 'pull_request.guid': Value(dtype='string', id=None), 'pull_request.id': Value(dtype='int64', id=None), 'pull_request.merged_at': Value(dtype='null', id=None), 'pull_request.merged_by.login': Value(dtype='null', id=None), 'pull_request.milestone.description': Value(dtype='null', id=None), 'pull_request.milestone.number': Value(dtype='null', id=None), 'pull_request.milestone.title': Value(dtype='null', id=None), 'pull_request.number': Value(dtype='int64', id=None), 'pull_request.review_comments': Value(dtype='int64', id=None), 'pull_request.state': Value(dtype='string', id=None), 'pull_request.title': Value(dtype='string', id=None), 'pull_request.user.id': Value(dtype='int64', id=None), 'pull_request.user.login': Value(dtype='string', id=None), 'repo.id': 
Value(dtype='int64', id=None), 'repo.name': Value(dtype='string', id=None)}, 'head_repo_info': {'pull_request.head.label': Value(dtype='string', id=None), 'pull_request.head.ref': Value(dtype='string', id=None), 'pull_request.head.repo.default_branch': Value(dtype='string', id=None), 'pull_request.head.repo.description': Value(dtype='null', id=None), 'pull_request.head.repo.homepage': Value(dtype='null', id=None), 'pull_request.head.repo.language': Value(dtype='string', id=None), 'pull_request.head.repo.license.name': Value(dtype='null', id=None), 'pull_request.head.repo.name': Value(dtype='string', id=None), 'pull_request.head.repo.owner.login': Value(dtype='string', id=None), 'pull_request.head.repo.owner.type': Value(dtype='string', id=None), 'pull_request.head.repo.private': Value(dtype='bool', id=None), 'pull_request.head.repo.stargazers_count': Value(dtype='int64', id=None), 'pull_request.head.sha': Value(dtype='string', id=None), 'pull_request.head.user.login': Value(dtype='string', id=None), 'pull_request.head.user.type': Value(dtype='string', id=None)}, 'base_repo_info': {'pull_request.base.label': Value(dtype='string', id=None), 'pull_request.base.ref': Value(dtype='string', id=None), 'pull_request.base.repo.default_branch': Value(dtype='string', id=None), 'pull_request.base.repo.description': Value(dtype='null', id=None), 'pull_request.base.repo.forks_count': Value(dtype='int64', id=None), 'pull_request.base.repo.homepage': Value(dtype='null', id=None), 'pull_request.base.repo.language': Value(dtype='string', id=None), 'pull_request.base.repo.license.name': Value(dtype='null', id=None), 'pull_request.base.repo.name': Value(dtype='string', id=None), 'pull_request.base.repo.open_issues_count': Value(dtype='int64', id=None), 'pull_request.base.repo.owner.login': Value(dtype='string', id=None), 'pull_request.base.repo.owner.type': Value(dtype='string', id=None), 'pull_request.base.repo.private': Value(dtype='bool', id=None), 
'pull_request.base.repo.stargazers_count': Value(dtype='int64', id=None), 'pull_request.base.repo.watchers_count': Value(dtype='int64', id=None), 'pull_request.base.sha': Value(dtype='string', id=None), 'pull_request.base.user.login': Value(dtype='string', id=None), 'pull_request.base.user.type': Value(dtype='string', id=None), 'pull_request.comments': Value(dtype='int64', id=None), 'pull_request.label.name': Value(dtype='null', id=None), 'pull_request.review_comments': Value(dtype='int64', id=None)}, 'events': [{'action': Value(dtype='string', id=None), 'created_at': Value(dtype='timestamp[us, tz=UTC]', id=None), 'issues_comments': {'action': Value(dtype='string', id=None), 'author': Value(dtype='null', id=None), 'comment': Value(dtype='null', id=None), 'comment_id': Value(dtype='null', id=None), 'datetime': Value(dtype='null', id=None), 'type': Value(dtype='string', id=None)}, 'review_comments': {'actor.id': Value(dtype='int64', id=None), 'actor.login': Value(dtype='string', id=None), 'comment.author_association': Value(dtype='null', id=None), 'comment.body': Value(dtype='null', id=None), 'comment.commit_id': Value(dtype='null', id=None), 'comment.created_at': Value(dtype='null', id=None), 'comment.diff_hunk': Value(dtype='null', id=None), 'comment.id': Value(dtype='null', id=None), 'comment.in_reply_to_id': Value(dtype='null', id=None), 'comment.line': Value(dtype='null', id=None), 'comment.original_commit_id': Value(dtype='null', id=None), 'comment.original_line': Value(dtype='null', id=None), 'comment.original_position': Value(dtype='null', id=None), 'comment.original_start_line': Value(dtype='null', id=None), 'comment.path': Value(dtype='null', id=None), 'comment.position': Value(dtype='null', id=None), 'comment.side': Value(dtype='null', id=None), 'comment.start_line': Value(dtype='null', id=None), 'comment.start_side': Value(dtype='null', id=None), 'comment.updated_at': Value(dtype='null', id=None), 'review.author_association': Value(dtype='null', id=None), 
'review.body': Value(dtype='null', id=None), 'review.commit_id': Value(dtype='null', id=None), 'review.id': Value(dtype='null', id=None), 'review.state': Value(dtype='null', id=None), 'review.submitted_at': Value(dtype='null', id=None), 'user.login': Value(dtype='null', id=None), 'user.type': Value(dtype='null', id=None)}, 'type': Value(dtype='string', id=None)}]} has unexpected type - {'org.id': Value(dtype='int64', id=None), 'public': Value(dtype='bool', id=None), 'pull_request.additions': Value(dtype='int64', id=None), 'pull_request.body': Value(dtype='string', id=None), 'pull_request.changed_files': Value(dtype='int64', id=None), 'pull_request.closed_at': Value(dtype='null', id=None), 'pull_request.comments': Value(dtype='int64', id=None), 'pull_request.commits': Value(dtype='int64', id=None), 'pull_request.created_at': Value(dtype='string', id=None), 'pull_request.deletions': Value(dtype='int64', id=None), 'pull_request.guid': Value(dtype='string', id=None), 'pull_request.id': Value(dtype='int64', id=None), 'pull_request.merged_at': Value(dtype='null', id=None), 'pull_request.merged_by.login': Value(dtype='null', id=None), 'pull_request.milestone.description': Value(dtype='null', id=None), 'pull_request.milestone.number': Value(dtype='null', id=None), 'pull_request.milestone.title': Value(dtype='null', id=None), 'pull_request.number': Value(dtype='int64', id=None), 'pull_request.review_comments': Value(dtype='int64', id=None), 'pull_request.state': Value(dtype='string', id=None), 'pull_request.title': Value(dtype='string', id=None), 'pull_request.user.id': Value(dtype='int64', id=None), 'pull_request.user.login': Value(dtype='string', id=None), 'repo.id': Value(dtype='int64', id=None), 'repo.name': Value(dtype='string', id=None)} (expected either {'org.id': Value(dtype='null', id=None), 'public': Value(dtype='bool', id=None), 'pull_request.additions': Value(dtype='int64', id=None), 'pull_request.body': Value(dtype='string', id=None), 
'pull_request.changed_files': Value(dtype='int64', id=None), 'pull_request.closed_at': Value(dtype='null', id=None), 'pull_request.comments': Value(dtype='int64', id=None), 'pull_request.commits': Value(dtype='int64', id=None), 'pull_request.created_at': Value(dtype='string', id=None), 'pull_request.deletions': Value(dtype='int64', id=None), 'pull_request.guid': Value(dtype='string', id=None), 'pull_request.id': Value(dtype='int64', id=None), 'pull_request.merged_at': Value(dtype='null', id=None), 'pull_request.merged_by.login': Value(dtype='null', id=None), 'pull_request.milestone.description': Value(dtype='null', id=None), 'pull_request.milestone.number': Value(dtype='null', id=None), 'pull_request.milestone.title': Value(dtype='null', id=None), 'pull_request.number': Value(dtype='int64', id=None), 'pull_request.review_comments': Value(dtype='int64', id=None), 'pull_request.state': Value(dtype='string', id=None), 'pull_request.title': Value(dtype='string', id=None), 'pull_request.user.id': Value(dtype='int64', id=None), 'pull_request.user.login': Value(dtype='string', id=None), 'repo.id': Value(dtype='int64', id=None), 'repo.name': Value(dtype='string', id=None)} or Value(\"null\").\n" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'type': 'PullRequestEvent',\n", - " 'action': 'opened',\n", - " 'actor.login': 'dependabot[bot]',\n", - " 'actor.id': 49699333,\n", - " 'user.login': None,\n", - " 'user.id': None,\n", - " 'user.type': None,\n", - " 'repo.name': 'AbdElrahmanMuhammedNasr/WuzuufMasr',\n", - " 'repo.id': 210433834,\n", - " 'public': True,\n", - " 'created_at': '2022-12-10T03:27:08Z',\n", - " 'org.id': None,\n", - " 'org.login': None,\n", - " 'pull_request.id': 1157080683,\n", - " 'pull_request.number': 35,\n", - " 'pull_request.state': 'open',\n", - " 'pull_request.title': 'Bump qs from 6.5.2 to 6.5.3',\n", - " 'pull_request.body': 'Bumps 
[qs](https://github.com/ljharb/qs) from 6.5.2 to 6.5.3.\\n
\\nChangelog\\n

Sourced from qs\\'s changelog.

\\n
\\n

6.5.3

\\n
    \\n
  • [Fix] parse: ignore __proto__ keys (#428)
  • \\n
  • [Fix] utils.merge: avoid a crash with a null target and a truthy non-array source
  • \\n
  • [Fix] correctly parse nested arrays
  • \\n
  • [Fix] stringify: fix a crash with strictNullHandling and a custom filter/serializeDate (#279)
  • \\n
  • [Fix] utils: merge: fix crash when source is a truthy primitive & no options are provided
  • \\n
  • [Fix] when parseArrays is false, properly handle keys ending in []
  • \\n
  • [Fix] fix for an impossible situation: when the formatter is called with a non-string value
  • \\n
  • [Fix] utils.merge: avoid a crash with a null target and an array source
  • \\n
  • [Refactor] utils: reduce observable [[Get]]s
  • \\n
  • [Refactor] use cached Array.isArray
  • \\n
  • [Refactor] stringify: Avoid arr = arr.concat(...), push to the existing instance (#269)
  • \\n
  • [Refactor] parse: only need to reassign the var once
  • \\n
  • [Robustness] stringify: avoid relying on a global undefined (#427)
  • \\n
  • [readme] remove travis badge; add github actions/codecov badges; update URLs
  • \\n
  • [Docs] Clean up license text so it’s properly detected as BSD-3-Clause
  • \\n
  • [Docs] Clarify the need for "arrayLimit" option
  • \\n
  • [meta] fix README.md (#399)
  • \\n
  • [meta] add FUNDING.yml
  • \\n
  • [actions] backport actions from main
  • \\n
  • [Tests] always use String(x) over x.toString()
  • \\n
  • [Tests] remove nonexistent tape option
  • \\n
  • [Dev Deps] backport from main
  • \\n
\\n
\\n
\\n
\\nCommits\\n
    \\n
  • 298bfa5 v6.5.3
  • \\n
  • ed0f5dc [Fix] parse: ignore __proto__ keys (#428)
  • \\n
  • 691e739 [Robustness] stringify: avoid relying on a global undefined (#427)
  • \\n
  • 1072d57 [readme] remove travis badge; add github actions/codecov badges; update URLs
  • \\n
  • 12ac1c4 [meta] fix README.md (#399)
  • \\n
  • 0338716 [actions] backport actions from main
  • \\n
  • 5639c20 Clean up license text so it’s properly detected as BSD-3-Clause
  • \\n
  • 51b8a0b add FUNDING.yml
  • \\n
  • 45f6759 [Fix] fix for an impossible situation: when the formatter is called with a no...
  • \\n
  • f814a7f [Dev Deps] backport from main
  • \\n
  • Additional commits viewable in compare view
  • \\n
\\n
\\n
\\n\\n\\n[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=qs&package-manager=npm_and_yarn&previous-version=6.5.2&new-version=6.5.3)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)\\n\\nDependabot will resolve any conflicts with this PR as long as you don\\'t alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.\\n\\n[//]: # (dependabot-automerge-start)\\n[//]: # (dependabot-automerge-end)\\n\\n---\\n\\n
\\nDependabot commands and options\\n
\\n\\nYou can trigger Dependabot actions by commenting on this PR:\\n- `@dependabot rebase` will rebase this PR\\n- `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it\\n- `@dependabot merge` will merge this PR after your CI passes on it\\n- `@dependabot squash and merge` will squash and merge this PR after your CI passes on it\\n- `@dependabot cancel merge` will cancel a previously requested merge and block automerging\\n- `@dependabot reopen` will reopen this PR if it is closed\\n- `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually\\n- `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)\\n- `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)\\n- `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)\\n- `@dependabot use these labels` will set the current labels as the default for future PRs for this repo and language\\n- `@dependabot use these reviewers` will set the current reviewers as the default for future PRs for this repo and language\\n- `@dependabot use these assignees` will set the current assignees as the default for future PRs for this repo and language\\n- `@dependabot use this milestone` will set the current milestone as the default for future PRs for this repo and language\\n\\nYou can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/AbdElrahmanMuhammedNasr/WuzuufMasr/network/alerts).\\n\\n
',\n", - " 'pull_request.user.login': 'dependabot[bot]',\n", - " 'pull_request.user.id': 49699333,\n", - " 'pull_request.author_association': 'NONE',\n", - " 'pull_request.created_at': '2022-12-10T03:27:08Z',\n", - " 'pull_request.updated_at': '2022-12-10T03:27:08Z',\n", - " 'pull_request.closed_at': None,\n", - " 'pull_request.merged_at': None,\n", - " 'pull_request.merge_commit_sha': None,\n", - " 'pull_request.locked': False,\n", - " 'pull_request.assignee.login': None,\n", - " 'pull_request.assignee.id': None,\n", - " 'pull_request.assignee.type': None,\n", - " 'pull_request.assignee.site_admin': None,\n", - " 'pull_request.milestone.id': None,\n", - " 'pull_request.milestone.number': None,\n", - " 'pull_request.milestone.title': None,\n", - " 'pull_request.milestone.description': None,\n", - " 'pull_request.milestone.creator.login': None,\n", - " 'pull_request.milestone.creator.id': None,\n", - " 'pull_request.milestone.creator.type': None,\n", - " 'pull_request.milestone.creator.site_admin': None,\n", - " 'pull_request.milestone.open_issues': None,\n", - " 'pull_request.milestone.closed_issues': None,\n", - " 'pull_request.milestone.state': None,\n", - " 'pull_request.milestone.created_at': None,\n", - " 'pull_request.milestone.updated_at': None,\n", - " 'pull_request.milestone.due_on': None,\n", - " 'pull_request.milestone.closed_at': None,\n", - " 'pull_request.merged': False,\n", - " 'pull_request.mergeable': None,\n", - " 'pull_request.mergeable_state': 'unknown',\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.id': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.merged_by.site_admin': None,\n", - " 'pull_request.comments': 0,\n", - " 'pull_request.review_comments': 0,\n", - " 'pull_request.commits': 1,\n", - " 'pull_request.additions': 3,\n", - " 'pull_request.deletions': 3,\n", - " 'pull_request.changed_files': 1,\n", - " 'pull_request.label.id': None,\n", - " 'pull_request.label.name': None,\n", - 
" 'pull_request.label.color': None,\n", - " 'pull_request.label.default': None,\n", - " 'pull_request.head.label': 'AbdElrahmanMuhammedNasr:dependabot/npm_and_yarn/qs-6.5.3',\n", - " 'pull_request.head.ref': 'dependabot/npm_and_yarn/qs-6.5.3',\n", - " 'pull_request.head.sha': '94469b10a02fa77e95bb22aaa0fbcc16ef03edfd',\n", - " 'pull_request.head.user.login': 'AbdElrahmanMuhammedNasr',\n", - " 'pull_request.head.user.type': 'User',\n", - " 'pull_request.head.repo.name': 'WuzuufMasr',\n", - " 'pull_request.head.repo.full_name': 'AbdElrahmanMuhammedNasr/WuzuufMasr',\n", - " 'pull_request.head.repo.owner.login': 'AbdElrahmanMuhammedNasr',\n", - " 'pull_request.head.repo.owner.type': 'User',\n", - " 'pull_request.head.repo.private': False,\n", - " 'pull_request.head.repo.homepage': None,\n", - " 'pull_request.head.repo.description': None,\n", - " 'pull_request.head.repo.fork': False,\n", - " 'pull_request.head.repo.created_at': '2019-09-23T19:17:51Z',\n", - " 'pull_request.head.repo.updated_at': '2019-10-11T19:57:45Z',\n", - " 'pull_request.head.repo.pushed_at': '2022-12-10T03:27:07Z',\n", - " 'pull_request.head.repo.size': 1345,\n", - " 'pull_request.head.repo.stargazers_count': 0,\n", - " 'pull_request.head.repo.watchers_count': 0,\n", - " 'pull_request.head.repo.language': 'TypeScript',\n", - " 'pull_request.head.repo.has_issues': True,\n", - " 'pull_request.head.repo.has_projects': True,\n", - " 'pull_request.head.repo.has_downloads': True,\n", - " 'pull_request.head.repo.has_wiki': True,\n", - " 'pull_request.head.repo.has_pages': False,\n", - " 'pull_request.head.repo.forks_count': 0,\n", - " 'pull_request.head.repo.archived': False,\n", - " 'pull_request.head.repo.disabled': False,\n", - " 'pull_request.head.repo.open_issues_count': 24,\n", - " 'pull_request.head.repo.forks': 0,\n", - " 'pull_request.head.repo.open_issues': 24,\n", - " 'pull_request.head.repo.watchers': 0,\n", - " 'pull_request.head.repo.default_branch': 'master',\n", - " 
'pull_request.head.repo.license.key': None,\n", - " 'pull_request.head.repo.license.spdx_id': None,\n", - " 'pull_request.head.repo.license.name': None,\n", - " 'pull_request.base.label': 'AbdElrahmanMuhammedNasr:master',\n", - " 'pull_request.base.ref': 'master',\n", - " 'pull_request.base.sha': 'a7d0127c02152dca69c41f83afb1a0a4d0c0e004',\n", - " 'pull_request.base.user.login': 'AbdElrahmanMuhammedNasr',\n", - " 'pull_request.base.user.type': 'User',\n", - " 'pull_request.base.repo.name': 'WuzuufMasr',\n", - " 'pull_request.base.repo.full_name': 'AbdElrahmanMuhammedNasr/WuzuufMasr',\n", - " 'pull_request.base.repo.owner.login': 'AbdElrahmanMuhammedNasr',\n", - " 'pull_request.base.repo.owner.type': 'User',\n", - " 'pull_request.base.repo.private': False,\n", - " 'pull_request.base.repo.homepage': None,\n", - " 'pull_request.base.repo.description': None,\n", - " 'pull_request.base.repo.fork': False,\n", - " 'pull_request.base.repo.created_at': '2019-09-23T19:17:51Z',\n", - " 'pull_request.base.repo.updated_at': '2019-10-11T19:57:45Z',\n", - " 'pull_request.base.repo.pushed_at': '2022-12-10T03:27:07Z',\n", - " 'pull_request.base.repo.size': 1345,\n", - " 'pull_request.base.repo.stargazers_count': 0,\n", - " 'pull_request.base.repo.watchers_count': 0,\n", - " 'pull_request.base.repo.language': 'TypeScript',\n", - " 'pull_request.base.repo.has_issues': True,\n", - " 'pull_request.base.repo.has_projects': True,\n", - " 'pull_request.base.repo.has_downloads': True,\n", - " 'pull_request.base.repo.has_wiki': True,\n", - " 'pull_request.base.repo.has_pages': False,\n", - " 'pull_request.base.repo.forks_count': 0,\n", - " 'pull_request.base.repo.archived': False,\n", - " 'pull_request.base.repo.disabled': False,\n", - " 'pull_request.base.repo.open_issues_count': 24,\n", - " 'pull_request.base.repo.forks': 0,\n", - " 'pull_request.base.repo.open_issues': 24,\n", - " 'pull_request.base.repo.watchers': 0,\n", - " 'pull_request.base.repo.default_branch': 'master',\n", - " 
'pull_request.base.repo.license.key': None,\n", - " 'pull_request.base.repo.license.spdx_id': None,\n", - " 'pull_request.base.repo.license.name': None,\n", - " 'pull_request.guid': 'AbdElrahmanMuhammedNasr/WuzuufMasr/pull/35'}]" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import json\n", - "res = json.loads(small_ds[0]['pull_request.events'])\n", - "res" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'action': 'opened',\n", - " 'author': 'hillc-usgs',\n", - " 'comment': None,\n", - " 'comment_id': None,\n", - " 'datetime': '2021-06-24T17:23:03Z',\n", - " 'description': 'This PR makes nldi_flowtools able to work with the new pygeoapi restructure, and makes it installable directly into the new tool. The processors are now contained within the library for nldi_flowtools directly, which makes it far simpler to roll out the plugin without needing coding modifications to the USGS pygeoapi tool.',\n", - " 'title': 'pygeoapi_plugins refit',\n", - " 'type': 'issue'},\n", - " {'action': 'created',\n", - " 'author': 'rmcd-mscb',\n", - " 'comment': \"@Anders-Hopkins - I merged Cliff's changes to keep things moving but you might want to review the changes for yourself when you get back. 
\",\n", - " 'comment_id': 868826717.0,\n", - " 'datetime': '2021-06-25 20:51:35+00:00',\n", - " 'description': None,\n", - " 'title': None,\n", - " 'type': 'comment'}]" - ] - }, - "execution_count": 65, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "issues = issues[0][\"events\"]\n", - "issues" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "10\n" - ] - } - ], - "source": [ - "for i in range(3, 20):\n", - " row = small_ds[i]\n", - " events = load_json(row[\"pull_request.events\"])\n", - " reviews = load_json(row[\"pull_request.code_review_events\"])\n", - " issues = load_json(row[\"pull_request.issue_events\"])\n", - " if reviews:\n", - " print(i)\n", - " break" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "len events 2, len reviews 1, len issues 1\n" - ] - } - ], - "source": [ - "row = small_ds[10]\n", - "events = load_json(row[\"pull_request.events\"])\n", - "reviews = load_json(row[\"pull_request.code_review_events\"])\n", - "issues = load_json(row[\"pull_request.issue_events\"])\n", - "print(f\"len events {len(events)}, len reviews {len(reviews)}, len issues {len(issues)}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [], - "source": [ - "events = load_json(row[\"pull_request.events\"])\n", - "reviews = load_json(row[\"pull_request.code_review_events\"])\n", - "issues = load_json(row[\"pull_request.issue_events\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [], - "source": [ - "L = events + reviews + issues" - ] - }, - { - "cell_type": "code", - "execution_count": 130, - "metadata": {}, - "outputs": [], - "source": [ - "events = load_json(row[\"pull_request.events\"])\n", - "reviews = 
load_json(row[\"pull_request.code_review_events\"])\n", - "issues = load_json(row[\"pull_request.issue_events\"])\n", - "assert len(issues) == 1\n", - "issues_events = issues[0][\"events\"]\n", - "# for each events in each category group all events sorted by \"created_at\" in one list\n", - "for e in issues_events:\n", - " e[\"created_at\"] = parse(e[\"datetime\"])\n", - " del e[\"datetime\"]\n", - "events = [update_datetime(e) for e in events]\n", - "reviews = [update_datetime(e) for e in reviews]\n", - "all_events = sorted(\n", - " events + reviews + issues_events,\n", - " key=lambda x: x[\"created_at\"]\n", - ")\n", - "\n", - "pr_info = {k: events[0][k] for k in pull_request_info_cols}\n", - "head_info = {k: events[0][k] for k in head_info_cols}\n", - "base_info = {k: events[0][k] for k in base_info_cols}\n", - "# each comment should have \"comments\" and \"review_comments\" fields with \"extra_review_info\" field\n", - "comments = [{\"type\": e[\"type\"],\n", - " \"action\": e[\"action\"],\n", - " \"created_at\": e[\"created_at\"],\n", - " \"review_comments\": get_review_info(e),\n", - " \"issues_comments\": get_issue_info(e)} for e in all_events]\n", - "new_row = {\"pull_request_info\": pr_info, \"head_repo_info\": head_info, \"base_repo_info\": base_info, \"events\": comments}" - ] - }, - { - "cell_type": "code", - "execution_count": 131, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['pull_request_info', 'head_repo_info', 'base_repo_info', 'events'])" - ] - }, - "execution_count": 131, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "new_row.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 146, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "**GitHub Repo**: ACWI-SSWD/nldi_flowtools, PR Number: 4, ID: 677298606\n", - "**GitHub Link**: https://github.com/ACWI-SSWD/nldi_flowtools/pull/4\n", - 
"----------------------------------------------------------------------------------------------------\n", - "Type: issue, action: opened, created_at: 2021-06-24 17:23:03+00:00\n", - "Author hillc-usgs did opened:\n", - "None\n", - "----------------------------------------------------------------------------------------------------\n", - "Type: PullRequestEvent, action: opened, created_at: 2021-06-24 17:23:04+00:00\n", - "Author hillc-usgs with association None did opened\n", - "----------------------------------------------------------------------------------------------------\n", - "Type: PullRequestReviewEvent, action: created, created_at: 2021-06-25 20:50:41+00:00\n", - "Author rmcd-mscb with association NONE did created\n", - "Review:\n", - "Thanks Cliff - Anders has been out this week, to keep things moving I'll merge the request and leave the branch for him to view when he gets back. \n", - "----------------------------------------------------------------------------------------------------\n", - "Type: PullRequestEvent, action: closed, created_at: 2021-06-25 20:50:54+00:00\n", - "Author rmcd-mscb with association None did closed\n", - "----------------------------------------------------------------------------------------------------\n", - "Type: comment, action: created, created_at: 2021-06-25 20:51:35+00:00\n", - "Author rmcd-mscb did created:\n", - "@Anders-Hopkins - I merged Cliff's changes to keep things moving but you might want to review the changes for yourself when you get back. 
\n" - ] - } - ], - "source": [ - "pr_info = new_row[\"pull_request_info\"]\n", - "res = f\"**GitHub Repo**: {pr_info['repo.name']}, PR Number: {pr_info['pull_request.number']}, ID: {pr_info['pull_request.id']}\"\n", - "gh_link = f\"https://github.com/{pr_info['repo.name']}/pull/{pr_info['pull_request.number']}\"\n", - "res += f\"\\n**GitHub Link**: {gh_link}\"\n", - "print(res)\n", - "for i in range(len(new_row[\"events\"])):\n", - " e = new_row[\"events\"][i]\n", - " print(\"-\" * 100)\n", - " print(f\"Type: {e['type']}, action: {e['action']}, created_at: {e['created_at']}\")\n", - " action = e['action']\n", - "\n", - " if e['type'] in [\"issue\", \"comment\"]:\n", - " e = e[\"issues_comments\"]\n", - " print(f\"Author {e['author']} did {e['action']}:\\n{e['comment']}\")\n", - "\n", - " elif e['type'] in [\"PullRequestEvent\", \"PullRequestReviewCommentEvent\", \"PullRequestReviewEvent\"]:\n", - " reviews = e[\"review_comments\"]\n", - " print(f\"Author {reviews['actor.login']} with association {reviews['review.author_association']} did {action}\")\n", - " if reviews['review.body']:\n", - " print(f\"Review:\\n{reviews['review.body']}\")\n", - " if reviews['comment.body']:\n", - " print(f\"Comment:\\n{reviews['comment.body']}\")\n", - " if reviews['comment.diff_hunk']:\n", - " print(f\"Diff hunk:\\n{reviews['diff_hunk']}\")\n", - " print(f\"File path {reviews['path']}\")\n", - " else:\n", - " print(\"OTHER\")\n", - " print(e[\"type\"])\n" - ] - }, - { - "cell_type": "code", - "execution_count": 144, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'pull_request.base.label': 'ACWI-SSWD:master',\n", - " 'pull_request.base.ref': 'master',\n", - " 'pull_request.base.sha': '4ce49143e7ce6e473554c3ebf7335a23d91ca91c',\n", - " 'pull_request.base.user.login': 'ACWI-SSWD',\n", - " 'pull_request.base.user.type': 'Organization',\n", - " 'pull_request.base.repo.owner.login': 'ACWI-SSWD',\n", - " 'pull_request.base.repo.owner.type': 'Organization',\n", - " 
'pull_request.base.repo.license.name': 'BSD 3-Clause \"New\" or \"Revised\" License',\n", - " 'pull_request.base.repo.default_branch': 'master',\n", - " 'pull_request.base.repo.description': None,\n", - " 'pull_request.base.repo.language': 'Python',\n", - " 'pull_request.base.repo.watchers_count': 3,\n", - " 'pull_request.base.repo.open_issues_count': 1,\n", - " 'pull_request.base.repo.forks_count': 0,\n", - " 'pull_request.base.repo.name': 'nldi_flowtools',\n", - " 'pull_request.base.repo.homepage': None,\n", - " 'pull_request.base.repo.stargazers_count': 3,\n", - " 'pull_request.base.repo.private': False,\n", - " 'pull_request.comments': 0,\n", - " 'pull_request.review_comments': 0,\n", - " 'pull_request.label.name': None}" - ] - }, - "execution_count": 144, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pr_info" - ] - }, - { - "cell_type": "code", - "execution_count": 145, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'type': 'PullRequestEvent',\n", - " 'action': 'opened',\n", - " 'actor.login': 'hillc-usgs',\n", - " 'actor.id': 84474574,\n", - " 'user.login': None,\n", - " 'user.id': None,\n", - " 'user.type': None,\n", - " 'repo.name': 'ACWI-SSWD/nldi_flowtools',\n", - " 'repo.id': 365244721,\n", - " 'public': True,\n", - " 'created_at': datetime.datetime(2021, 6, 24, 17, 23, 4, tzinfo=tzlocal()),\n", - " 'org.id': 17301770,\n", - " 'org.login': 'ACWI-SSWD',\n", - " 'pull_request.id': 677298606,\n", - " 'pull_request.number': 4,\n", - " 'pull_request.state': 'open',\n", - " 'pull_request.title': 'pygeoapi_plugins refit',\n", - " 'pull_request.body': 'This PR makes nldi_flowtools able to work with the new pygeoapi restructure, and makes it installable directly into the new tool. 
The processors are now contained within the library for nldi_flowtools directly, which makes it far simpler to roll out the plugin without needing coding modifications to the USGS pygeoapi tool.',\n", - " 'pull_request.user.login': 'hillc-usgs',\n", - " 'pull_request.user.id': 84474574,\n", - " 'pull_request.author_association': 'NONE',\n", - " 'pull_request.created_at': '2021-06-24T17:23:03Z',\n", - " 'pull_request.updated_at': '2021-06-24T17:23:03Z',\n", - " 'pull_request.closed_at': None,\n", - " 'pull_request.merged_at': None,\n", - " 'pull_request.merge_commit_sha': None,\n", - " 'pull_request.locked': False,\n", - " 'pull_request.assignee.login': None,\n", - " 'pull_request.assignee.id': None,\n", - " 'pull_request.assignee.type': None,\n", - " 'pull_request.assignee.site_admin': None,\n", - " 'pull_request.milestone.id': None,\n", - " 'pull_request.milestone.number': None,\n", - " 'pull_request.milestone.title': None,\n", - " 'pull_request.milestone.description': None,\n", - " 'pull_request.milestone.creator.login': None,\n", - " 'pull_request.milestone.creator.id': None,\n", - " 'pull_request.milestone.creator.type': None,\n", - " 'pull_request.milestone.creator.site_admin': None,\n", - " 'pull_request.milestone.open_issues': None,\n", - " 'pull_request.milestone.closed_issues': None,\n", - " 'pull_request.milestone.state': None,\n", - " 'pull_request.milestone.created_at': None,\n", - " 'pull_request.milestone.updated_at': None,\n", - " 'pull_request.milestone.due_on': None,\n", - " 'pull_request.milestone.closed_at': None,\n", - " 'pull_request.merged': False,\n", - " 'pull_request.mergeable': None,\n", - " 'pull_request.mergeable_state': 'unknown',\n", - " 'pull_request.merged_by.login': None,\n", - " 'pull_request.merged_by.id': None,\n", - " 'pull_request.merged_by.type': None,\n", - " 'pull_request.merged_by.site_admin': None,\n", - " 'pull_request.comments': 0,\n", - " 'pull_request.review_comments': 0,\n", - " 'pull_request.commits': 5,\n", - " 
'pull_request.additions': 321,\n", - " 'pull_request.deletions': 25,\n", - " 'pull_request.changed_files': 5,\n", - " 'pull_request.label.id': None,\n", - " 'pull_request.label.name': None,\n", - " 'pull_request.label.color': None,\n", - " 'pull_request.label.default': None,\n", - " 'pull_request.head.label': 'ACWI-SSWD:pygeoapi_plugins-refit',\n", - " 'pull_request.head.ref': 'pygeoapi_plugins-refit',\n", - " 'pull_request.head.sha': '9143699913269aff0814979d932957efeb002eb1',\n", - " 'pull_request.head.user.login': 'ACWI-SSWD',\n", - " 'pull_request.head.user.type': 'Organization',\n", - " 'pull_request.head.repo.name': 'nldi_flowtools',\n", - " 'pull_request.head.repo.full_name': 'ACWI-SSWD/nldi_flowtools',\n", - " 'pull_request.head.repo.owner.login': 'ACWI-SSWD',\n", - " 'pull_request.head.repo.owner.type': 'Organization',\n", - " 'pull_request.head.repo.private': False,\n", - " 'pull_request.head.repo.homepage': None,\n", - " 'pull_request.head.repo.description': None,\n", - " 'pull_request.head.repo.fork': False,\n", - " 'pull_request.head.repo.created_at': '2021-05-07T13:36:47Z',\n", - " 'pull_request.head.repo.updated_at': '2021-06-23T14:27:31Z',\n", - " 'pull_request.head.repo.pushed_at': '2021-06-24T15:15:30Z',\n", - " 'pull_request.head.repo.size': 4309,\n", - " 'pull_request.head.repo.stargazers_count': 3,\n", - " 'pull_request.head.repo.watchers_count': 3,\n", - " 'pull_request.head.repo.language': 'Python',\n", - " 'pull_request.head.repo.has_issues': True,\n", - " 'pull_request.head.repo.has_projects': True,\n", - " 'pull_request.head.repo.has_downloads': True,\n", - " 'pull_request.head.repo.has_wiki': True,\n", - " 'pull_request.head.repo.has_pages': False,\n", - " 'pull_request.head.repo.forks_count': 0,\n", - " 'pull_request.head.repo.archived': False,\n", - " 'pull_request.head.repo.disabled': False,\n", - " 'pull_request.head.repo.open_issues_count': 1,\n", - " 'pull_request.head.repo.forks': 0,\n", - " 'pull_request.head.repo.open_issues': 
1,\n", - " 'pull_request.head.repo.watchers': 3,\n", - " 'pull_request.head.repo.default_branch': 'master',\n", - " 'pull_request.head.repo.license.key': 'bsd-3-clause',\n", - " 'pull_request.head.repo.license.spdx_id': 'BSD-3-Clause',\n", - " 'pull_request.head.repo.license.name': 'BSD 3-Clause \"New\" or \"Revised\" License',\n", - " 'pull_request.base.label': 'ACWI-SSWD:master',\n", - " 'pull_request.base.ref': 'master',\n", - " 'pull_request.base.sha': '4ce49143e7ce6e473554c3ebf7335a23d91ca91c',\n", - " 'pull_request.base.user.login': 'ACWI-SSWD',\n", - " 'pull_request.base.user.type': 'Organization',\n", - " 'pull_request.base.repo.name': 'nldi_flowtools',\n", - " 'pull_request.base.repo.full_name': 'ACWI-SSWD/nldi_flowtools',\n", - " 'pull_request.base.repo.owner.login': 'ACWI-SSWD',\n", - " 'pull_request.base.repo.owner.type': 'Organization',\n", - " 'pull_request.base.repo.private': False,\n", - " 'pull_request.base.repo.homepage': None,\n", - " 'pull_request.base.repo.description': None,\n", - " 'pull_request.base.repo.fork': False,\n", - " 'pull_request.base.repo.created_at': '2021-05-07T13:36:47Z',\n", - " 'pull_request.base.repo.updated_at': '2021-06-23T14:27:31Z',\n", - " 'pull_request.base.repo.pushed_at': '2021-06-24T15:15:30Z',\n", - " 'pull_request.base.repo.size': 4309,\n", - " 'pull_request.base.repo.stargazers_count': 3,\n", - " 'pull_request.base.repo.watchers_count': 3,\n", - " 'pull_request.base.repo.language': 'Python',\n", - " 'pull_request.base.repo.has_issues': True,\n", - " 'pull_request.base.repo.has_projects': True,\n", - " 'pull_request.base.repo.has_downloads': True,\n", - " 'pull_request.base.repo.has_wiki': True,\n", - " 'pull_request.base.repo.has_pages': False,\n", - " 'pull_request.base.repo.forks_count': 0,\n", - " 'pull_request.base.repo.archived': False,\n", - " 'pull_request.base.repo.disabled': False,\n", - " 'pull_request.base.repo.open_issues_count': 1,\n", - " 'pull_request.base.repo.forks': 0,\n", - " 
'pull_request.base.repo.open_issues': 1,\n", - " 'pull_request.base.repo.watchers': 3,\n", - " 'pull_request.base.repo.default_branch': 'master',\n", - " 'pull_request.base.repo.license.key': 'bsd-3-clause',\n", - " 'pull_request.base.repo.license.spdx_id': 'BSD-3-Clause',\n", - " 'pull_request.base.repo.license.name': 'BSD 3-Clause \"New\" or \"Revised\" License',\n", - " 'pull_request.guid': 'ACWI-SSWD/nldi_flowtools/pull/4'},\n", - " {'type': 'PullRequestEvent',\n", - " 'action': 'closed',\n", - " 'actor.login': 'rmcd-mscb',\n", - " 'actor.id': 11791580,\n", - " 'user.login': None,\n", - " 'user.id': None,\n", - " 'user.type': None,\n", - " 'repo.name': 'ACWI-SSWD/nldi_flowtools',\n", - " 'repo.id': 365244721,\n", - " 'public': True,\n", - " 'created_at': datetime.datetime(2021, 6, 25, 20, 50, 54, tzinfo=tzlocal()),\n", - " 'org.id': 17301770,\n", - " 'org.login': 'ACWI-SSWD',\n", - " 'pull_request.id': 677298606,\n", - " 'pull_request.number': 4,\n", - " 'pull_request.state': 'closed',\n", - " 'pull_request.title': 'pygeoapi_plugins refit',\n", - " 'pull_request.body': 'This PR makes nldi_flowtools able to work with the new pygeoapi restructure, and makes it installable directly into the new tool. 
The processors are now contained within the library for nldi_flowtools directly, which makes it far simpler to roll out the plugin without needing coding modifications to the USGS pygeoapi tool.',\n", - " 'pull_request.user.login': 'hillc-usgs',\n", - " 'pull_request.user.id': 84474574,\n", - " 'pull_request.author_association': 'NONE',\n", - " 'pull_request.created_at': '2021-06-24T17:23:03Z',\n", - " 'pull_request.updated_at': '2021-06-25T20:50:53Z',\n", - " 'pull_request.closed_at': '2021-06-25T20:50:53Z',\n", - " 'pull_request.merged_at': '2021-06-25T20:50:53Z',\n", - " 'pull_request.merge_commit_sha': 'c0a8e850c8e627b0474b9059582e7a61e5fd3699',\n", - " 'pull_request.locked': False,\n", - " 'pull_request.assignee.login': None,\n", - " 'pull_request.assignee.id': None,\n", - " 'pull_request.assignee.type': None,\n", - " 'pull_request.assignee.site_admin': None,\n", - " 'pull_request.milestone.id': None,\n", - " 'pull_request.milestone.number': None,\n", - " 'pull_request.milestone.title': None,\n", - " 'pull_request.milestone.description': None,\n", - " 'pull_request.milestone.creator.login': None,\n", - " 'pull_request.milestone.creator.id': None,\n", - " 'pull_request.milestone.creator.type': None,\n", - " 'pull_request.milestone.creator.site_admin': None,\n", - " 'pull_request.milestone.open_issues': None,\n", - " 'pull_request.milestone.closed_issues': None,\n", - " 'pull_request.milestone.state': None,\n", - " 'pull_request.milestone.created_at': None,\n", - " 'pull_request.milestone.updated_at': None,\n", - " 'pull_request.milestone.due_on': None,\n", - " 'pull_request.milestone.closed_at': None,\n", - " 'pull_request.merged': True,\n", - " 'pull_request.mergeable': None,\n", - " 'pull_request.mergeable_state': 'unknown',\n", - " 'pull_request.merged_by.login': 'rmcd-mscb',\n", - " 'pull_request.merged_by.id': 11791580,\n", - " 'pull_request.merged_by.type': 'User',\n", - " 'pull_request.merged_by.site_admin': False,\n", - " 'pull_request.comments': 0,\n", 
- " 'pull_request.review_comments': 0,\n", - " 'pull_request.commits': 7,\n", - " 'pull_request.additions': 292,\n", - " 'pull_request.deletions': 1,\n", - " 'pull_request.changed_files': 5,\n", - " 'pull_request.label.id': None,\n", - " 'pull_request.label.name': None,\n", - " 'pull_request.label.color': None,\n", - " 'pull_request.label.default': None,\n", - " 'pull_request.head.label': 'ACWI-SSWD:pygeoapi_plugins-refit',\n", - " 'pull_request.head.ref': 'pygeoapi_plugins-refit',\n", - " 'pull_request.head.sha': '3e3fe0dfdfce5fe24c25231c3207c2d292b31165',\n", - " 'pull_request.head.user.login': 'ACWI-SSWD',\n", - " 'pull_request.head.user.type': 'Organization',\n", - " 'pull_request.head.repo.name': 'nldi_flowtools',\n", - " 'pull_request.head.repo.full_name': 'ACWI-SSWD/nldi_flowtools',\n", - " 'pull_request.head.repo.owner.login': 'ACWI-SSWD',\n", - " 'pull_request.head.repo.owner.type': 'Organization',\n", - " 'pull_request.head.repo.private': False,\n", - " 'pull_request.head.repo.homepage': None,\n", - " 'pull_request.head.repo.description': None,\n", - " 'pull_request.head.repo.fork': False,\n", - " 'pull_request.head.repo.created_at': '2021-05-07T13:36:47Z',\n", - " 'pull_request.head.repo.updated_at': '2021-06-23T14:27:31Z',\n", - " 'pull_request.head.repo.pushed_at': '2021-06-25T20:50:53Z',\n", - " 'pull_request.head.repo.size': 4310,\n", - " 'pull_request.head.repo.stargazers_count': 3,\n", - " 'pull_request.head.repo.watchers_count': 3,\n", - " 'pull_request.head.repo.language': 'Python',\n", - " 'pull_request.head.repo.has_issues': True,\n", - " 'pull_request.head.repo.has_projects': True,\n", - " 'pull_request.head.repo.has_downloads': True,\n", - " 'pull_request.head.repo.has_wiki': True,\n", - " 'pull_request.head.repo.has_pages': False,\n", - " 'pull_request.head.repo.forks_count': 0,\n", - " 'pull_request.head.repo.archived': False,\n", - " 'pull_request.head.repo.disabled': False,\n", - " 'pull_request.head.repo.open_issues_count': 0,\n", - " 
'pull_request.head.repo.forks': 0,\n", - " 'pull_request.head.repo.open_issues': 0,\n", - " 'pull_request.head.repo.watchers': 3,\n", - " 'pull_request.head.repo.default_branch': 'master',\n", - " 'pull_request.head.repo.license.key': 'bsd-3-clause',\n", - " 'pull_request.head.repo.license.spdx_id': 'BSD-3-Clause',\n", - " 'pull_request.head.repo.license.name': 'BSD 3-Clause \"New\" or \"Revised\" License',\n", - " 'pull_request.base.label': 'ACWI-SSWD:master',\n", - " 'pull_request.base.ref': 'master',\n", - " 'pull_request.base.sha': '4ce49143e7ce6e473554c3ebf7335a23d91ca91c',\n", - " 'pull_request.base.user.login': 'ACWI-SSWD',\n", - " 'pull_request.base.user.type': 'Organization',\n", - " 'pull_request.base.repo.name': 'nldi_flowtools',\n", - " 'pull_request.base.repo.full_name': 'ACWI-SSWD/nldi_flowtools',\n", - " 'pull_request.base.repo.owner.login': 'ACWI-SSWD',\n", - " 'pull_request.base.repo.owner.type': 'Organization',\n", - " 'pull_request.base.repo.private': False,\n", - " 'pull_request.base.repo.homepage': None,\n", - " 'pull_request.base.repo.description': None,\n", - " 'pull_request.base.repo.fork': False,\n", - " 'pull_request.base.repo.created_at': '2021-05-07T13:36:47Z',\n", - " 'pull_request.base.repo.updated_at': '2021-06-23T14:27:31Z',\n", - " 'pull_request.base.repo.pushed_at': '2021-06-25T20:50:53Z',\n", - " 'pull_request.base.repo.size': 4310,\n", - " 'pull_request.base.repo.stargazers_count': 3,\n", - " 'pull_request.base.repo.watchers_count': 3,\n", - " 'pull_request.base.repo.language': 'Python',\n", - " 'pull_request.base.repo.has_issues': True,\n", - " 'pull_request.base.repo.has_projects': True,\n", - " 'pull_request.base.repo.has_downloads': True,\n", - " 'pull_request.base.repo.has_wiki': True,\n", - " 'pull_request.base.repo.has_pages': False,\n", - " 'pull_request.base.repo.forks_count': 0,\n", - " 'pull_request.base.repo.archived': False,\n", - " 'pull_request.base.repo.disabled': False,\n", - " 
'pull_request.base.repo.open_issues_count': 0,\n", - " 'pull_request.base.repo.forks': 0,\n", - " 'pull_request.base.repo.open_issues': 0,\n", - " 'pull_request.base.repo.watchers': 3,\n", - " 'pull_request.base.repo.default_branch': 'master',\n", - " 'pull_request.base.repo.license.key': 'bsd-3-clause',\n", - " 'pull_request.base.repo.license.spdx_id': 'BSD-3-Clause',\n", - " 'pull_request.base.repo.license.name': 'BSD 3-Clause \"New\" or \"Revised\" License',\n", - " 'pull_request.guid': 'ACWI-SSWD/nldi_flowtools/pull/4'}]" - ] - }, - "execution_count": 145, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "events" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'2021-06-24T17:23:03Z'" - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "events[0][\"created_at\"]\n", - "issues[0][\"events\"][0][\"datetime\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'action': 'created',\n", - " 'author': 'rmcd-mscb',\n", - " 'comment': \"@Anders-Hopkins - I merged Cliff's changes to keep things moving but you might want to review the changes for yourself when you get back. 
\",\n", - " 'comment_id': 868826717.0,\n", - " 'datetime': '2021-06-25 20:51:35+00:00',\n", - " 'description': None,\n", - " 'title': None,\n", - " 'type': 'comment'}" - ] - }, - "execution_count": 60, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "issues[0][\"events\"][1]\n" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['repo', 'org', 'issue_id', 'issue_number', 'pull_request', 'events'])" - ] - }, - "execution_count": 56, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "issues[0].keys()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "all_events = sorted(\n", - " events + reviews + issues,\n", - " key=lambda x: x[\"created_at\"]\n", - ")\n", - "pr_info = {k: all_events[-1][k] for k in pull_request_info_cols}\n", - "head_info = {k: all_events[-1][k] for k in head_info_cols}\n", - "base_info = {k: all_events[-1][k] for k in base_info_cols}\n", - "# each comment should have \"comments\" and \"review_comments\" fields with \"extra_review_info\" field\n", - "comments = [{\"comments\": e[\"pull_request.comments\"],\n", - " \"review_comments\": e[\"pull_request.review_comments\"],\n", - " \"extra_review_info\": get_extra_review_info(e)} for e in all_events]\n", - "new_row = {\"pr_info\": pr_info, \"head_info\": head_info, \"base_info\": base_info, \"comments\": comments}" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "ename": "KeyError", - "evalue": "'created_at'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[38], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m new_row \u001b[39m=\u001b[39m merge_events(row)\n", 
- "Cell \u001b[0;32mIn[36], line 106\u001b[0m, in \u001b[0;36mmerge_events\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 102\u001b[0m issues \u001b[39m=\u001b[39m load_json(row[\u001b[39m\"\u001b[39m\u001b[39mpull_request.issue_events\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[1;32m 103\u001b[0m \u001b[39m# for each events in each category group all events sorted by \"created_at\" in one list\u001b[39;00m\n\u001b[1;32m 104\u001b[0m \u001b[39m# then merge all three lists\u001b[39;00m\n\u001b[1;32m 105\u001b[0m \u001b[39m# then sort by \"created_at\"\u001b[39;00m\n\u001b[0;32m--> 106\u001b[0m all_events \u001b[39m=\u001b[39m \u001b[39msorted\u001b[39;49m(\n\u001b[1;32m 107\u001b[0m events \u001b[39m+\u001b[39;49m reviews \u001b[39m+\u001b[39;49m issues,\n\u001b[1;32m 108\u001b[0m key\u001b[39m=\u001b[39;49m\u001b[39mlambda\u001b[39;49;00m x: x[\u001b[39m\"\u001b[39;49m\u001b[39mcreated_at\u001b[39;49m\u001b[39m\"\u001b[39;49m]\n\u001b[1;32m 109\u001b[0m )\n\u001b[1;32m 110\u001b[0m pr_info \u001b[39m=\u001b[39m {k: all_events[\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m][k] \u001b[39mfor\u001b[39;00m k \u001b[39min\u001b[39;00m pull_request_info_cols}\n\u001b[1;32m 111\u001b[0m head_info \u001b[39m=\u001b[39m {k: all_events[\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m][k] \u001b[39mfor\u001b[39;00m k \u001b[39min\u001b[39;00m head_info_cols}\n", - "Cell \u001b[0;32mIn[36], line 108\u001b[0m, in \u001b[0;36mmerge_events..\u001b[0;34m(x)\u001b[0m\n\u001b[1;32m 102\u001b[0m issues \u001b[39m=\u001b[39m load_json(row[\u001b[39m\"\u001b[39m\u001b[39mpull_request.issue_events\u001b[39m\u001b[39m\"\u001b[39m])\n\u001b[1;32m 103\u001b[0m \u001b[39m# for each events in each category group all events sorted by \"created_at\" in one list\u001b[39;00m\n\u001b[1;32m 104\u001b[0m \u001b[39m# then merge all three lists\u001b[39;00m\n\u001b[1;32m 105\u001b[0m \u001b[39m# then sort by \"created_at\"\u001b[39;00m\n\u001b[1;32m 106\u001b[0m all_events \u001b[39m=\u001b[39m 
\u001b[39msorted\u001b[39m(\n\u001b[1;32m 107\u001b[0m events \u001b[39m+\u001b[39m reviews \u001b[39m+\u001b[39m issues,\n\u001b[0;32m--> 108\u001b[0m key\u001b[39m=\u001b[39m\u001b[39mlambda\u001b[39;00m x: x[\u001b[39m\"\u001b[39;49m\u001b[39mcreated_at\u001b[39;49m\u001b[39m\"\u001b[39;49m]\n\u001b[1;32m 109\u001b[0m )\n\u001b[1;32m 110\u001b[0m pr_info \u001b[39m=\u001b[39m {k: all_events[\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m][k] \u001b[39mfor\u001b[39;00m k \u001b[39min\u001b[39;00m pull_request_info_cols}\n\u001b[1;32m 111\u001b[0m head_info \u001b[39m=\u001b[39m {k: all_events[\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m][k] \u001b[39mfor\u001b[39;00m k \u001b[39min\u001b[39;00m head_info_cols}\n", - "\u001b[0;31mKeyError\u001b[0m: 'created_at'" - ] - } - ], - "source": [ - "new_row = merge_events(row)" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'repo': 'ACWI-SSWD/nldi_flowtools',\n", - " 'org': 'ACWI-SSWD',\n", - " 'issue_id': 929448726,\n", - " 'issue_number': 4,\n", - " 'pull_request': {'number': 4.0,\n", - " 'repo': 'nldi_flowtools',\n", - " 'user_login': 'ACWI-SSWD'},\n", - " 'events': [{'action': 'opened',\n", - " 'author': 'hillc-usgs',\n", - " 'comment': None,\n", - " 'comment_id': None,\n", - " 'datetime': '2021-06-24T17:23:03Z',\n", - " 'description': 'This PR makes nldi_flowtools able to work with the new pygeoapi restructure, and makes it installable directly into the new tool. The processors are now contained within the library for nldi_flowtools directly, which makes it far simpler to roll out the plugin without needing coding modifications to the USGS pygeoapi tool.',\n", - " 'title': 'pygeoapi_plugins refit',\n", - " 'type': 'issue'},\n", - " {'action': 'created',\n", - " 'author': 'rmcd-mscb',\n", - " 'comment': \"@Anders-Hopkins - I merged Cliff's changes to keep things moving but you might want to review the changes for yourself when you get back. 
\",\n", - " 'comment_id': 868826717.0,\n", - " 'datetime': '2021-06-25 20:51:35+00:00',\n", - " 'description': None,\n", - " 'title': None,\n", - " 'type': 'comment'}]}" - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "issues" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "# for each events in each category group all events sorted by \"created_at\" in one list\n", - "# then merge all three lists\n", - "# then sort by \"created_at\"\n", - "all_events = sorted(\n", - " events + reviews + issues,\n", - " key=lambda x: x[\"created_at\"]\n", - ")\n", - "pr_info = {k: all_events[-1][k] for k in pull_request_info_cols}\n", - "head_info = {k: all_events[-1][k] for k in head_info_cols}\n", - "base_info = {k: all_events[-1][k] for k in base_info_cols}\n", - "# each comment should have \"comments\" and \"review_comments\" fields with \"extra_review_info\" field\n", - "comments = [{\"comments\": e[\"pull_request.comments\"],\n", - " \"review_comments\": e[\"pull_request.review_comments\"],\n", - " \"extra_review_info\": get_extra_review_info(e)} for e in all_events]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pull_request_info_cols = [\n", - " \"repo.name\",\n", - " \"repo.id\",\n", - " \"org.id\",\n", - " \"public\",\n", - " \"pull_request.id\",\n", - " \"pull_request.guid\",\n", - " \"pull_request.number\",\n", - " \"pull_request.title\",\n", - " \"pull_request.body\",\n", - " \"pull_request.state\",\n", - " \"pull_request.user.login\",\n", - " \"pull_request.user.id\",\n", - " \"pull_request.created_at\",\n", - " \"pull_request.closed_at\",\n", - " \"pull_request.merged_at\",\n", - " \"pull_request.merged_by.login\",\n", - " \"pull_request.milestone.title\",\n", - " \"pull_request.milestone.description\",\n", - " \"pull_request.milestone.number\",\n", - " # 
commits\n", - " 'pull_request.commits',\n", - " 'pull_request.additions',\n", - " 'pull_request.deletions',\n", - " # changed files\n", - " 'pull_request.changed_files',\n", - "]\n", - "\n", - "comments = [\n", - " 'pull_request.comments',\n", - " 'pull_request.review_comments',\n", - " # for PR event\n", - " 'pull_request.label.name',\n", - " # review events only\n", - " 'review.state',\n", - " 'review.id', \n", - " 'review.body', \n", - " 'review.commit_id', \n", - " 'review.submitted_at', \n", - " 'review.author_association', '\n", - "]\n", - "\n", - "head_info_cols = [\n", - " \"pull_request.head.label\",\n", - " \"pull_request.head.ref\",\n", - " \"pull_request.head.user.login\",\n", - " \"pull_request.head.user.type\",\n", - " \"pull_request.head.repo.owner.login\",\n", - " \"pull_request.head.repo.owner.type\",\n", - " \"pull_request.head.repo.license.name\",\n", - " \"pull_request.head.sha\",\n", - " 'pull_request.head.repo.name',\n", - " 'pull_request.head.repo.owner.login',\n", - " 'pull_request.head.repo.homepage',\n", - " 'pull_request.head.repo.description',\n", - " 'pull_request.head.repo.language',\n", - " 'pull_request.head.repo.stargazers_count',\n", - " 'pull_request.head.repo.license.name',\n", - " 'pull_request.head.repo.default_branch',\n", - " 'pull_request.head.repo.private'\n", - "]\n", - "base_info_cols = [\n", - " \"pull_request.base.label\",\n", - " \"pull_request.base.ref\",\n", - " \"pull_request.base.sha\",\n", - " \"pull_request.base.user.login\",\n", - " \"pull_request.base.user.type\",\n", - " \"pull_request.base.repo.owner.login\",\n", - " \"pull_request.base.repo.owner.type\",\n", - " \"pull_request.base.repo.license.name\",\n", - " \"pull_request.base.repo.default_branch\",\n", - " \"pull_request.base.repo.description\",\n", - " \"pull_request.base.repo.language\",\n", - " \"pull_request.base.repo.watchers_count\",\n", - " \"pull_request.base.repo.open_issues_count\",\n", - " \"pull_request.base.repo.forks_count\",\n", - " 
'pull_request.base.repo.name',\n", - " 'pull_request.base.repo.owner.login',\n", - " 'pull_request.base.repo.homepage',\n", - " 'pull_request.base.repo.description',\n", - " 'pull_request.base.repo.language',\n", - " 'pull_request.base.repo.stargazers_count',\n", - " 'pull_request.base.repo.private',\n", - "]\n", - "# drop \"repo.name\", \"repo.id\", \"public\" so they are not duplicated and keep relevant columns that might change\n", - "event_cols = [\n", - " col\n", - " for col in df.columns\n", - " if (not col.startswith(\"pull_request.\"))\n", - " and col not in [\"repo.name\", \"repo.id\", \"public\"]\n", - "] + [\n", - " \"pull_request.head.label\",\n", - " \"pull_request.head.ref\",\n", - " \"pull_request.head.sha\",\n", - " \"pull_request.title\",\n", - "]" + "merged_ds.push_to_hub(\"loubnabnl/code_reviews_500k\")" ] } ],