diff --git a/data_analysis/stackoverflow/README.md b/data_analysis/stackoverflow/README.md
new file mode 100644
index 0000000..089cc28
--- /dev/null
+++ b/data_analysis/stackoverflow/README.md
@@ -0,0 +1,3 @@
+## Code for processing StackExchange data
+
+Code for processing the StackExchange data dump is available in `h4_code` (used to build https://huggingface.co/datasets/HuggingFaceH4/stack-exchange-preferences) and `other`; the notebook for further processing (e.g. converting all HTML to Markdown) is `StackExchangeProcessing.ipynb` (used to build https://huggingface.co/datasets/lvwerra/stack-exchange-paired).
diff --git a/data_analysis/stackoverflow/StackExchangeProcessing.ipynb b/data_analysis/stackoverflow/StackExchangeProcessing.ipynb
new file mode 100644
index 0000000..d37df68
--- /dev/null
+++ b/data_analysis/stackoverflow/StackExchangeProcessing.ipynb
@@ -0,0 +1,718 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 86,
+ "id": "7821c501-8c5d-4af6-81cd-caa6ad0bd58c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from datasets import load_dataset, DatasetDict\n",
+ "from datasets import concatenate_datasets\n",
+ "from IPython.display import HTML\n",
+ "\n",
+ "from tqdm import tqdm\n",
+ "import re \n",
+ "import numpy as np\n",
+ "from markdownify import markdownify as md"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 80,
+ "id": "dc821970-efdb-407f-bd79-59da09323280",
+ "metadata": {
+ "scrolled": true,
+ "tags": []
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Found cached dataset parquet (/home/leandro/.cache/huggingface/datasets/HuggingFaceH4___parquet/HuggingFaceH4--stack-exchange-preferences-1d2bff9ecb5ffe2a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "Dataset({\n",
+ " features: ['qid', 'question', 'answers', 'date', 'metadata'],\n",
+ " num_rows: 10807695\n",
+ "})"
+ ]
+ },
+ "execution_count": 80,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ds = load_dataset(\"HuggingFaceH4/stack-exchange-preferences\", split=\"train\", num_proc=16)\n",
+ "ds"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 81,
+ "id": "0d8d8729-6d6b-4791-a24a-cb112c399bd0",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
I have been wanting to learn about 3D printing a long time so I really want this site to succeed but I have no previous experience with the subject.
\n", + "\n", + "I was wondering how can I help the site at this early stage. I thought about asking about how to get started with 3D printing but SE explicitly discourages \"easy\" questions in the private beta.
\n", + "\n", + "What can newbies like me do for the site at this stage besides voting questions and answers?
\n" + ], + "text/plain": [ + "Looking at the library code, seems all events are renamed removing 'fileupload' ... so 'fileuploaddone' becomes just 'done'. It is valid for all other callbacks.\\nlook at this section:
\\n\\n // Other callbacks:\\n // Callback for the submit event of each file upload:\\n // submit: function (e, data) {}, // .bind('fileuploadsubmit', func);\\n // Callback for the start of each file upload request:\\n // send: function (e, data) {}, // .bind('fileuploadsend', func);\\n // Callback for successful uploads:\\n // done: function (e, data) {}, // .bind('fileuploaddone', func);\\n // Callback for failed (abort or error) uploads:\\n // fail: function (e, data) {}, // .bind('fileuploadfail', func);\\n // Callback for completed (success, abort or error) requests:\\n // always: function (e, data) {}, // .bind('fileuploadalways', func);\\n // Callback for upload progress events:\\n // progress: function (e, data) {}, // .bind('fileuploadprogress', func);\\n // Callback for global upload progress events:\\n // progressall: function (e, data) {}, // .bind('fileuploadprogressall', func);\\n // Callback for uploads start, equivalent to the global ajaxStart event:\\n // start: function (e) {}, // .bind('fileuploadstart', func);\\n // Callback for uploads stop, equivalent to the global ajaxStop event:\\n // stop: function (e) {}, // .bind('fileuploadstop', func);\\n // Callback for change events of the fileInput(s):\\n // change: function (e, data) {}, // .bind('fileuploadchange', func);\\n // Callback for paste events to the pasteZone(s):\\n // paste: function (e, data) {}, // .bind('fileuploadpaste', func);\\n // Callback for drop events of the dropZone(s):\\n // drop: function (e, data) {}, // .bind('fileuploaddrop', func);\\n // Callback for dragover events of the dropZone(s):\\n // dragover: function (e) {}, // .bind('fileuploaddragover', func);\\n\\n\\nIf you have some doubts about what's happening, just look at the code inside. This library is not compressed so it is easy to see. for example
\\n\\n// start: function (e) {}, // .bind('fileuploadstart', func);\\n\\n\\nstart callback is implemented. fileuploadstart is not.
Check if the server-side uploading script returns a JSON reply - in my case it didn\\'t work when the reply was empty, but file was uploaded successfully.
\\n\\nSo, below is working for me with jQuery 1.9.1 and the newest version of the \"jQuery File Upload Plugin\" - 5.21.3
\\n\\n$(\"#fileupload\").bind(\"fileuploaddone\", function (e, data) {\\n console.log(\"fileuploaddone event fired\");\\n});\\n\\n'}],\n",
+ " 'date': '2012/10/15',\n",
+ " 'metadata': ['https://Stackoverflow.com/questions/12891264',\n",
+ " 'https://Stackoverflow.com',\n",
+ " 'https://Stackoverflow.com/users/767244/'],\n",
+ " 'response_j': \"Looking at the library code, seems all events are renamed removing 'fileupload' ... so 'fileuploaddone' becomes just 'done'. It is valid for all other callbacks.\\nlook at this section:\\n\\n```\\n // Other callbacks:\\n // Callback for the submit event of each file upload:\\n // submit: function (e, data) {}, // .bind('fileuploadsubmit', func);\\n // Callback for the start of each file upload request:\\n // send: function (e, data) {}, // .bind('fileuploadsend', func);\\n // Callback for successful uploads:\\n // done: function (e, data) {}, // .bind('fileuploaddone', func);\\n // Callback for failed (abort or error) uploads:\\n // fail: function (e, data) {}, // .bind('fileuploadfail', func);\\n // Callback for completed (success, abort or error) requests:\\n // always: function (e, data) {}, // .bind('fileuploadalways', func);\\n // Callback for upload progress events:\\n // progress: function (e, data) {}, // .bind('fileuploadprogress', func);\\n // Callback for global upload progress events:\\n // progressall: function (e, data) {}, // .bind('fileuploadprogressall', func);\\n // Callback for uploads start, equivalent to the global ajaxStart event:\\n // start: function (e) {}, // .bind('fileuploadstart', func);\\n // Callback for uploads stop, equivalent to the global ajaxStop event:\\n // stop: function (e) {}, // .bind('fileuploadstop', func);\\n // Callback for change events of the fileInput(s):\\n // change: function (e, data) {}, // .bind('fileuploadchange', func);\\n // Callback for paste events to the pasteZone(s):\\n // paste: function (e, data) {}, // .bind('fileuploadpaste', func);\\n // Callback for drop events of the dropZone(s):\\n // drop: function (e, data) {}, // .bind('fileuploaddrop', func);\\n // Callback for dragover events of the dropZone(s):\\n // dragover: function (e) {}, // .bind('fileuploaddragover', func);\\n\\n```\\n\\nIf you have some doubts about what's happening, just look at the code inside. This library is not compressed so it is easy to see. for example\\n\\n```\\n// start: function (e) {}, // .bind('fileuploadstart', func);\\n\\n```\\n\\n`start` callback is implemented. `fileuploadstart` is not.\",\n",
+ " 'response_k': 'Check if the server-side uploading script returns a JSON reply - in my case it didn\\'t work when the reply was empty, but file was uploaded successfully.\\n\\nSo, below is working for me with jQuery 1.9.1 and the newest version of the \"jQuery File Upload Plugin\" - 5.21.3\\n\\n```\\n$(\"#fileupload\").bind(\"fileuploaddone\", function (e, data) {\\n console.log(\"fileuploaddone event fired\");\\n});\\n\\n```'}"
+ ]
+ },
+ "execution_count": 93,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ds_result[\"finetune\"][0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 94,
+ "id": "2c96653b-7a5a-4cae-a327-b6aa77aa5850",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ds_result = ds_result.remove_columns([\"answers\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 95,
+ "id": "15c2e5ee-7c7d-4e98-9e63-e5d37a9354aa",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "DatasetDict({\n",
+ " finetune: Dataset({\n",
+ " features: ['qid', 'question', 'date', 'metadata', 'response_j', 'response_k'],\n",
+ " num_rows: 7440923\n",
+ " })\n",
+ " reward: Dataset({\n",
+ " features: ['qid', 'question', 'date', 'metadata', 'response_j', 'response_k'],\n",
+ " num_rows: 7441998\n",
+ " })\n",
+ " rl: Dataset({\n",
+ " features: ['qid', 'question', 'date', 'metadata', 'response_j', 'response_k'],\n",
+ " num_rows: 7435908\n",
+ " })\n",
+ " evaluation: Dataset({\n",
+ " features: ['qid', 'question', 'date', 'metadata', 'response_j', 'response_k'],\n",
+ " num_rows: 4483004\n",
+ " })\n",
+ "})"
+ ]
+ },
+ "execution_count": 95,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ds_result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 96,
+ "id": "4d42b35c-5252-4b49-ba4b-20818bc9e086",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "finetune\n",
+ "reward\n",
+ "rl\n",
+ "evaluation\n"
+ ]
+ }
+ ],
+ "source": [
+ "for key in ds_result:\n",
+ " print(key)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 100,
+ "id": "e32c11d7-a88e-4d92-9dfc-92b2a67c5455",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import time\n",
+ "from multiprocessing import Pool\n",
+ "from tqdm import tqdm\n",
+ "\n",
+ "from huggingface_hub import Repository\n",
+ "\n",
+ "\n",
+ "def save_shard(shard_tuple):\n",
+ " \"\"\"Save shard\"\"\"\n",
+ " filename, shard = shard_tuple\n",
+ " # use to_json instead to save as json file\n",
+ " shard.to_parquet(filename)\n",
+ "\n",
+ "\n",
+ "def save_manual_shards(ds, user=\"lvwerra\", remote_dataset_repo=\"stack-exchange-paired\", subfolder=\"train\"):\n",
+ " \"\"\"Save sharded data\n",
+ " Args:\n",
+ " ds (Dataset): dataset to be saved\n",
+ " user (str): user name\n",
+ " remote_dataset_repo (str): remote dataset repository\n",
+ " out_path (str): path to save the shards\"\"\"\n",
+ " # this will create a folder OUT_PATH that is a clone of REMOTE_DATASET_REPO\n",
+ " # you can save the shards inside it and do git add/commit/push to push data to the hub\n",
+ " out_path = remote_dataset_repo\n",
+ " # if out path doesnt already exist\n",
+ " if not os.path.exists(out_path):\n",
+ " repo = Repository(\n",
+ " local_dir=out_path,\n",
+ " clone_from=user + \"/\" + remote_dataset_repo,\n",
+ " repo_type=\"dataset\",\n",
+ " private=False,\n",
+ " use_auth_token=True,\n",
+ " git_user=user,\n",
+ " )\n",
+ "\n",
+ " # files will be numerous we save them in a folder called data inside out_path\n",
+ " if not os.path.exists(out_path):\n",
+ " os.mkdir(out_path + \"/data\")\n",
+ " os.mkdir(out_path + f\"/data/{subfolder}\")\n",
+ " \n",
+ " SHARD_SIZE = 1000 << 20\n",
+ " if ds._indices is not None:\n",
+ " dataset_nbytes = ds.data.nbytes * len(ds._indices) / len(ds.data)\n",
+ " else:\n",
+ " dataset_nbytes = ds.data.nbytes\n",
+ " num_shards = int(dataset_nbytes / SHARD_SIZE) + 1\n",
+ " print(f\"Number of shards: {num_shards}\")\n",
+ "\n",
+ " print(\"sharding the dataset\")\n",
+ " t_start = time.time()\n",
+ " shards = (\n",
+ " ds.shard(num_shards=num_shards, index=i, contiguous=True)\n",
+ " for i in range(num_shards)\n",
+ " )\n",
+ " # use f\"{OUT_PATH}/data/train-{index:05d}-of-{num_shards:05d}.json\" instead for json files\n",
+ " filenames = (\n",
+ " f\"{out_path}/data/{subfolder}/train-{index:05d}-of-{num_shards:05d}.parquet\"\n",
+ " for index in range(num_shards)\n",
+ " )\n",
+ "\n",
+ " with Pool(16) as p:\n",
+ " list(\n",
+ " tqdm(\n",
+ " p.imap_unordered(save_shard, zip(filenames, shards), chunksize=4),\n",
+ " total=num_shards,\n",
+ " )\n",
+ " )\n",
+ " print(f\"Time to save dataset: {time.time()-t_start:.2f}\")\n",
+ " # to push dataset to hub do: git add/commit/push inside OUT_PATH"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 101,
+ "id": "a90664eb-5c54-4fae-9a8a-d509bb2abdfe",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of shards: 20\n",
+ "sharding the dataset\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:28<00:00, 1.43s/it]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Time to save dataset: 29.15\n",
+ "Number of shards: 20\n",
+ "sharding the dataset\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:22<00:00, 1.15s/it]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Time to save dataset: 23.42\n",
+ "Number of shards: 20\n",
+ "sharding the dataset\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:10<00:00, 1.83it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Time to save dataset: 11.36\n",
+ "Number of shards: 12\n",
+ "sharding the dataset\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:10<00:00, 1.12it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Time to save dataset: 11.13\n"
+ ]
+ }
+ ],
+ "source": [
+ "for key in ds_result:\n",
+ " save_manual_shards(ds_result[key], subfolder=key)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d62f5a7f-2a23-4e0d-9e49-b29f88ea8c13",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/data_analysis/stackoverflow/h4_code/README.md b/data_analysis/stackoverflow/h4_code/README.md
new file mode 100644
index 0000000..a190c44
--- /dev/null
+++ b/data_analysis/stackoverflow/h4_code/README.md
@@ -0,0 +1,28 @@
+# Scripts for preference model pretraining data
+
+### Stack Exchange
+Note: the Stack Exchange Data Dump is released under a license that requires attribution of the authors and links back to the original material; see more [here](https://archive.org/details/stackexchange).
+
+1) `stack_exchange_explore.py`: example script for filtering stack exchange data to the question & answer format in Askell et al. 2021 on preference model pretraining (PMP).
+
+To run this code from scratch (including data download and faster processing), do the following:
+identify the raw data dump you want to process via `ex_data_url` and the related data variables (further string optimizations can be added).
+The script will download the raw data if needed, uncompress it, and process the file to text.
+
+```shell
+python scripts/data/pmp/stack_exchange_explore.py --stack_exchange=pets --save_to_text=True
+```
+
+2) `stack_exchange_process.py`: same as above, but designed to be run on a large machine to process all files consecutively.
+It is essentially one long for-loop over the desired exchanges.
+
+```shell
+python scripts/data/pmp/stack_exchange_process.py --save_path=/path/to/hf-dataset
+```
+
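+The records these processing scripts emit look roughly like the sketch below (field names follow `stack_exchange_process.py`; the values are illustrative only):
+
+```python
+{
+    "qid": 42,
+    "question": "<p>HTML body of the question</p>",
+    "answers": [
+        {
+            "answer_id": 43,
+            "text": "<p>HTML body of the answer</p>",
+            "pm_score": 3,      # round(log2(1 + score)), +1 if accepted, -1 if score < 0
+            "selected": True,   # whether this was the accepted answer
+            "author": "some-user",
+            "author_id": 16,
+            "author_profile": "https://pets.stackexchange.com/users/16",
+        },
+    ],
+    "date": "2012/10/15",
+    "metadata": ["<question URL>", "<exchange URL>", "<author URL>"],
+}
+```
+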
+3) `binarize.py`: used to binarize the pre-filtered Stack Exchange data
+```shell
+python scripts/data/pmp/binarize.py --save_path=/path/to/hf-dataset
+```
+
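+For reference, `binarize.py` expands each kept question into two `{"context", "score"}` records, one per sampled answer, both receiving the `--binary_score` value (default 8). A sketch with illustrative values:
+
+```python
+{"context": "Question: <question text>\nAnswer: <first sampled answer>", "score": 8}
+{"context": "Question: <question text>\nAnswer: <second sampled answer>", "score": 8}
+```
+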
+Credits: code from HuggingFaceH4 team
diff --git a/data_analysis/stackoverflow/h4_code/binarize.py b/data_analysis/stackoverflow/h4_code/binarize.py
new file mode 100644
index 0000000..79bcce8
--- /dev/null
+++ b/data_analysis/stackoverflow/h4_code/binarize.py
@@ -0,0 +1,117 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import random
+from argparse import ArgumentParser
+from pathlib import Path
+
+import numpy as np
+from datasets import Dataset, concatenate_datasets, load_dataset
+
+from h4.data.utils import save_dataset_shards
+
+
+H4_DIR = Path(__file__).resolve().parents[3]
+DATA_DIR = H4_DIR / "data"
+
+if __name__ == "__main__":
+ parser = ArgumentParser()
+ parser.add_argument("--debug", action="store_true", help="Added print statements / limit data size for debugging")
+ parser.add_argument(
+ "--output_dir",
+ default=f"{DATA_DIR}/pmp-binarized",
+ type=str,
+ help="Where to save the processed dataset",
+ )
+ parser.add_argument(
+ "--exchange_name",
+ type=str,
+ default=None,
+ help="Optional argument to specify a specific subsection of the dataset",
+ )
+ parser.add_argument(
+ "--binary_score", type=int, default=8, help="Score assigned to binarized pairs for preference data."
+ )
+ parser.add_argument(
+ "--stream_data", action="store_true", help="Optionally stream data, which can be useful with weaker computers"
+ )
+ parser.set_defaults(debug=False, stream_data=False) # default will process full dataset
+
+ args = parser.parse_args()
+ specific_exchange = args.exchange_name
+ stream_dataset = args.stream_data
+ binary_score = args.binary_score
+
+ if specific_exchange:
+ data_dir = "data/" + args.exchange_name
+ else:
+ data_dir = None
+
+ if args.debug:
+ data_len_limit = 10000
+ else:
+ data_len_limit = np.inf
+
+ dataset = load_dataset(
+ "HuggingFaceH4/pmp-stack-exchange",
+ data_dir=data_dir,
+ split="train",
+ streaming=stream_dataset,
+ )
+
+ pmp_data = []
+ for i, d in enumerate(iter(dataset)):
+ # check debug limit, quit if in debug mode (don't save)
+ if i > data_len_limit:
+ print("Early exit for debug mode!")
+ print(pmp_data)
+ break
+
+ question = d["question"]
+ answers = d["answers"]
+ num_answers = len(answers)
+
+ answer_scores = [a["pm_score"] for a in answers]
+ if len(np.unique(answer_scores)) < 2:
+ print(f"PM Scores are {answer_scores}, skipping this question {i}")
+ else:
+ # Sample 2 unique scores for binarization
+ dif_scores = False
+ while not dif_scores:
+ # print("infinite loop...?")
+ two_answers = random.sample(answers, 2)
+
+ if two_answers[0]["pm_score"] != two_answers[1]["pm_score"]:
+ dif_scores = True
+
+ answer_0 = two_answers[0]
+ answer_1 = two_answers[1]
+ text_0 = "Question: " + question + "\n" + "Answer: " + answer_0["text"]
+ text_1 = "Question: " + question + "\n" + "Answer: " + answer_1["text"]
+ score_0 = binary_score
+ score_1 = binary_score
+
+ pmp_data.append({"context": text_0, "score": score_0})
+ pmp_data.append({"context": text_1, "score": score_1})
+
+ # Save binarized data
+ sublist_len = 100000
+
+ print(f"Dataset length is {len(pmp_data)}")
+ # bypass known issue in arrow https://issues.apache.org/jira/browse/ARROW-17137
+ print(f"Processed dataset length > {sublist_len}, processing to HF dataset in chunks")
+ chunks = [pmp_data[x : x + sublist_len] for x in range(0, len(pmp_data), sublist_len)]
+ ds_chunks = [Dataset.from_list(ch) for ch in chunks]
+ ds = concatenate_datasets(ds_chunks)
+
+ save_dataset_shards(ds, args.output_dir, subset="stackexchange", shard_size="100MB")
diff --git a/data_analysis/stackoverflow/h4_code/stack_exchange_explore.py b/data_analysis/stackoverflow/h4_code/stack_exchange_explore.py
new file mode 100644
index 0000000..33a8746
--- /dev/null
+++ b/data_analysis/stackoverflow/h4_code/stack_exchange_explore.py
@@ -0,0 +1,305 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import datetime
+import os
+import time
+
+
+try:
+ from lxml import etree as ET
+except ImportError:
+ import xml.etree.ElementTree as ET
+
+from argparse import ArgumentParser
+
+import numpy as np
+
+
+parser = ArgumentParser()
+parser.add_argument("--stack_exchange", default="ai", type=str, help="Which stack exchange data to process")
+parser.add_argument(
+ "--save_to_text", default=False, type=bool, help="Whether or not the outputs are saved to a text file."
+)
+parser.add_argument("--debug", default=False, type=bool, help="Added print statements for debugging")
+
+args = parser.parse_args()
+
+save = args.save_to_text
+se_name = args.stack_exchange + ".stackexchange.com"
+DEBUG = args.debug
+
+
+start_time = time.time()
+
+data_dir = "data/"
+if not os.path.exists(data_dir):
+ os.mkdir(data_dir)
+
+# check if unpacked data exists:
+ex_data_file = data_dir + se_name + "/Posts.xml"
+if not os.path.exists(ex_data_file):
+ # get raw data
+ ex_data_file_7z = se_name + ".7z"
+ if not os.path.exists(data_dir + ex_data_file_7z):
+ print("Loading raw data, this can take a second!")
+ import py7zr
+ import requests
+
+ ex_data_url = (
+ "https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml/resolve/main/"
+ + ex_data_file_7z
+ )
+ response = requests.get(ex_data_url, allow_redirects=True)
+ filename = os.path.basename(ex_data_url)
+
+ if response.status_code == 200:
+ with open(data_dir + filename, "wb") as out:
+ out.write(response.content)
+ os.mkdir(data_dir + se_name)
+ with py7zr.SevenZipFile(data_dir + filename, "r") as archive:
+ archive.extractall(data_dir + se_name + "/")
+ else:
+ print("Request failed: %d" % response.status_code)
+
+ print("Loaded data, now processing!")
+
+# load extracted xml files
+local_path = data_dir + se_name + "/" # "ai.stackexchange.com/"
+posts_subpath = "Posts.xml"
+votes_subpath = "Votes.xml"
+users_subpath = "Users.xml"
+
+"""
+XML file structure:
+* PostTypeID ranges from 1: Question, 2: Answer, ....
+* We only want posts with AcceptedAnswerId fields
+
+(docs https://meta.stackexchange.com/questions/2677/database-schema-documentation-for-the-public-data-dump-and-sede)
+"""
+
+
+def print_dict(d):
+ for key, val in d.items():
+ print(f"{key}, {val}")
+
+
+def simplify_date(date_string):
+ date = datetime.datetime.strptime(date_string.split(".")[0], "%Y-%m-%dT%H:%M:%S")
+ return date.strftime("%Y/%m/%d")
+
+
+user_info = {-1: "(user-deleted)"}
+question_info = {}
+answer_info = {}
+
+# extract user data for license
+with open(local_path + users_subpath, "rb") as f: # Users file
+ tree = ET.parse(f)
+ for exchange in tree.iter("row"):
+ tag = int(exchange.attrib["Id"])
+ user_info[tag] = str(exchange.attrib["DisplayName"])
+
+if DEBUG:
+ print_dict(user_info)
+
+with open(local_path + posts_subpath, "rb") as f: # Posts file
+ tree = ET.parse(f)
+
+ # process questions, find answers next
+ # note: this could all be done in one loop, storing everything, if memory is cheaper than processing speed
+
+ # iterate through all rows
+ for exchange in tree.iter("row"):
+ # find 2+ answers
+ if "AnswerCount" in exchange.attrib:
+ ans_count = int(exchange.attrib["AnswerCount"])
+
+ # only save questions with >= 2 answers
+ if ans_count >= 2:
+ tag = int(exchange.attrib["Id"])
+
+ result = {}
+ result["Body"] = exchange.attrib["Body"]
+
+ # store some metadata
+ result["AnswerCount"] = ans_count
+ result["PostScore"] = int(exchange.attrib["Score"])
+
+ # save metadata
+ if "OwnerUserId" in exchange.attrib:
+ user_id = int(exchange.attrib["OwnerUserId"])
+ else:
+ user_id = -1 # deleted user redirect to community page
+
+ result["Author"] = user_id # should fail for some deleted entries
+ result["metadata"] = [
+ "https://" + se_name + "/questions/" + str(tag),
+ "https://" + se_name,
+ "https://"
+ + se_name
+ + "/users/"
+ + str(user_id)
+ + "/", # don't include username afterwards to avoid case with spaces in name (string regex problem)
+ ]
+ result["Date"] = simplify_date(exchange.attrib["CreationDate"])
+
+ # if accepted answer, store it
+ if "AcceptedAnswerId" in exchange.attrib:
+ accepted_ans = int(exchange.attrib["AcceptedAnswerId"])
+ result["AcceptedAnswerId"] = accepted_ans
+ else:
+ result["AcceptedAnswerId"] = None
+
+ question_info[tag] = result
+ if DEBUG:
+ print_dict(question_info[tag])
+
+ # process looking for answers
+ for i, exchange in enumerate(tree.iter("row")):
+ # answers are ID type 2
+ if int(exchange.attrib["PostTypeId"]) == 2:
+ # get parent, check if in question_info
+ parent = int(exchange.attrib["ParentId"])
+ # note, that parent will be same as tag above in answer_info and question_info
+
+ # log if parent is in questions (multiple answers for preference model)
+ if parent in question_info:
+ # info for answers
+ ans_text = exchange.attrib["Body"]
+ ans_score = int(exchange.attrib["Score"])
+ ans_id = int(exchange.attrib["Id"]) # extra score if this ID matches accept id above
+
+ # save metadata
+ if "OwnerUserId" in exchange.attrib:
+ user_id = int(exchange.attrib["OwnerUserId"])
+ else:
+ user_id = -1 # deleted user
+ # we'll need to store multiple answers per tag
+ if parent not in answer_info:
+ answer_info[parent] = {}
+ answer_info[parent]["Text"] = []
+ answer_info[parent]["Score"] = []
+ answer_info[parent]["Id"] = []
+ answer_info[parent]["Author"] = []
+ answer_info[parent]["AuthorNames"] = []
+
+ answer_info[parent]["Text"].append(ans_text)
+ answer_info[parent]["Score"].append(ans_score)
+ answer_info[parent]["Id"].append(ans_id)
+ answer_info[parent]["Author"].append(user_id) # should fail for some deleted entries
+ answer_info[parent]["AuthorNames"].append(user_info[user_id])
+
+ if DEBUG:
+ print_dict(answer_info[parent])
+
+# in debug mode, stop here rather than saving
+if DEBUG:
+ quit()
+
+qa_keys = question_info.keys()
+if save:
+ import json
+
+ output_file = open(data_dir + "output.jsonl", "w")
+
+final_outputs = {"domain": args.stack_exchange}
+print(" ------ printing processed questions ------ ------ ------ ------ ------ ------ ")
+for k in qa_keys:
+ question_data = question_info[k]
+ if not save:
+ print(" . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ")
+ print(f"Question (id: {k}): {question_data['Body']}")
+
+ accepted_ans = question_data["AcceptedAnswerId"]
+
+ answer_data = answer_info[k]
+ metadata = question_data["metadata"]
+ date = question_data["Date"]
+ # filter for number of unique scores to be >= 2 (per paper)
+ scores = answer_data["Score"]
+ if len(np.unique(scores)) >= 2:
+ answers = []
+ for i, (text, score, ans_id, auth_name, auth_id) in enumerate(
+ zip(answer_data["Text"], scores, answer_data["Id"], answer_data["AuthorNames"], answer_data["Author"])
+ ):
+ sub_answer = {}
+ accepted = accepted_ans == ans_id
+
+ if score >= 0:
+ s = round(np.log2(1 + score))
+
+ # not documented if negative answers can be accepted, assuming no
+ if accepted: # add 1 to score if answer was accepted
+ s += 1
+ else:
+ s = -1
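+ # e.g. a raw score of 7 gives round(log2(8)) = 3 (4 if it is also the accepted answer);
+ # a raw score of 0 gives 0, and any negative score gives -1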
+
+ # print or save, *** indicates preferred answer
+ pref = ", ***" if accepted else ""
+ sub_answer["AnswerID"] = ans_id
+ sub_answer["text"] = text
+ sub_answer["pm_score"] = s
+ sub_answer["selected"] = accepted
+ sub_answer["Author"] = auth_name
+ sub_answer["AuthorID"] = auth_id
+ sub_answer["AuthorProfile"] = "https://" + se_name + "/users/" + str(auth_id)
+ answers.append(sub_answer)
+ if not save:
+ print(f"Answer (id {ans_id}, s:{s}{pref}): {text}")
+ print(" . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . ")
+
+ if save:
+ json_obj = {
+ "qid": k,
+ "question": question_data["Body"],
+ "answers": answers,
+ "date": date,
+ "metadata": metadata,
+ }
+ json.dump(json_obj, output_file)
+ output_file.write("\n")  # one JSON object per line keeps the output valid JSONL
+
+print(f"finished at {time.time() - start_time}s")
+"""
+Added options/notes for scaling & changing this script
+
+Adding a dataloader to use HuggingFace Datasets
+`from datasets import load_dataset`
+-----
+
+Logs on loading 7z files:
+Example for samsum dataset::
+https://github.com/huggingface/datasets/blob/fedf891a08bfc77041d575fad6c26091bc0fce52/datasets/samsum/samsum.py#L106-L110
+-----
+
+Making a cleaner repo + dataloader out of the raw data here:
+https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml/tree/main
+* move many files into folder (how to do that without loading)?
+* add data loader (see above, shouldn't be so hard)
+* figure out storage datatype of the processed data
+----
+
+Maybe consider using Beautiful Soup?
+https://www.crummy.com/software/BeautifulSoup/bs4/doc/
+
+# list files in the raw repository
+from huggingface_hub import HfApi
+api = HfApi()
+
+se_files = api.list_repo_files("flax-sentence-embeddings/stackexchange_xml", repo_type="dataset")
+se_data_files = [f for f in se_files if "7z" in f]
+se_names = [f[:f.find(".")] for f in se_files if "7z" in f]
+se_names = [f + ".meta" if (i%2) == 0 else f for i, f in enumerate(se_names)]
+# print(se_data_files)
+
+"""
diff --git a/data_analysis/stackoverflow/h4_code/stack_exchange_process.py b/data_analysis/stackoverflow/h4_code/stack_exchange_process.py
new file mode 100644
index 0000000..11d7f31
--- /dev/null
+++ b/data_analysis/stackoverflow/h4_code/stack_exchange_process.py
@@ -0,0 +1,718 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import datetime
+import os
+import time
+
+from datasets import Dataset, concatenate_datasets
+
+import py7zr
+import requests
+from h4.data.utils import save_dataset_shards
+
+
+try:
+ from lxml import etree as ET
+except ImportError:
+ import xml.etree.ElementTree as ET
+
+from argparse import ArgumentParser
+from pathlib import Path
+
+import numpy as np
+
+
+H4_DIR = Path(__file__).resolve().parents[3]
+# TODO: Ideally we would use PosixPath here, but it doesn't work with the way the script is implemented :)
+DATA_DIR = str(H4_DIR) + "/data/pmp-stack-exchange/"
+
+# stack exchanges we filter
+ALL_EXCHANGES = [
+ "3dprinting.meta",
+ "3dprinting",
+ "academia.meta",
+ "academia",
+ "ai.meta",
+ "ai",
+ "android.meta",
+ "android",
+ "anime.meta",
+ "anime",
+ "apple.meta",
+ "apple",
+ "arduino.meta",
+ "arduino",
+ "askubuntu",
+ "astronomy",
+ "astronomy.meta",
+ "aviation",
+ "aviation.meta",
+ "avp",
+ "avp.meta",
+ "beer",
+ "beer.meta",
+ "bicycles",
+ "bicycles.meta",
+ "bioinformatics",
+ "bioinformatics.meta",
+ "biology",
+ "biology.meta",
+ "bitcoin",
+ "bitcoin.meta",
+ "blender",
+ "blender.meta",
+ "boardgames",
+ "boardgames.meta",
+ "bricks",
+ "bricks.meta",
+ "buddhism",
+ "buddhism.meta",
+ "cardano",
+ "cardano.meta",
+ "chemistry",
+ "chemistry.meta",
+ "chess",
+ "chess.meta",
+ "chinese",
+ "chinese.meta",
+ "christianity",
+ "christianity.meta",
+ "civicrm",
+ "civicrm.meta",
+ "codegolf",
+ "codegolf.meta",
+ "codereview",
+ "codereview.meta",
+ "coffee",
+ "coffee.meta",
+ "cogsci",
+ "cogsci.meta",
+ "computergraphics",
+ "computergraphics.meta",
+ "conlang",
+ "conlang.meta",
+ "cooking",
+ "cooking.meta",
+ "craftcms",
+ "craftcms.meta",
+ "crafts",
+ "crafts.meta",
+ "crypto",
+ "crypto.meta",
+ "cs",
+ "cs.meta",
+ "cseducators",
+ "cseducators.meta",
+ "cstheory",
+ "cstheory.meta",
+ "datascience",
+ "datascience.meta",
+ "dba",
+ "dba.meta",
+ "devops",
+ "devops.meta",
+ "diy",
+ "diy.meta",
+ "drones",
+ "drones.meta",
+ "drupal",
+ "drupal.meta",
+ "dsp",
+ "dsp.meta",
+ "earthscience",
+ "earthscience.meta",
+ "ebooks",
+ "ebooks.meta",
+ "economics",
+ "economics.meta",
+ "electronics",
+ "electronics.meta",
+ "elementaryos",
+ "elementaryos.meta",
+ "ell",
+ "ell.meta",
+ "emacs",
+ "emacs.meta",
+ "engineering",
+ "engineering.meta",
+ "english",
+ "english.meta",
+ "eosio",
+ "eosio.meta",
+ "esperanto",
+ "esperanto.meta",
+ "ethereum",
+ "ethereum.meta",
+ "expatriates",
+ "expatriates.meta",
+ "expressionengine",
+ "expressionengine.meta",
+ "fitness",
+ "fitness.meta",
+ "freelancing",
+ "freelancing.meta",
+ "french",
+ "french.meta",
+ "gamedev",
+ "gamedev.meta",
+ "gaming",
+ "gaming.meta",
+ "gardening",
+ "gardening.meta",
+ "genealogy",
+ "genealogy.meta",
+ "german",
+ "german.meta",
+ "gis",
+ "gis.meta",
+ "graphicdesign",
+ "graphicdesign.meta",
+ "ham",
+ "ham.meta",
+ "hardwarerecs",
+ "hardwarerecs.meta",
+ "health",
+ "health.meta",
+ "hermeneutics",
+ "hermeneutics.meta",
+ "hinduism",
+ "hinduism.meta",
+ "history",
+ "history.meta",
+ "homebrew",
+ "homebrew.meta",
+ "hsm",
+ "hsm.meta",
+ "interpersonal",
+ "interpersonal.meta",
+ "iot",
+ "iot.meta",
+ "iota",
+ "iota.meta",
+ "islam",
+ "islam.meta",
+ "italian",
+ "italian.meta",
+ "japanese",
+ "japanese.meta",
+ "joomla",
+ "joomla.meta",
+ "judaism",
+ "judaism.meta",
+ "korean",
+ "korean.meta",
+ "languagelearning",
+ "languagelearning.meta",
+ "latin",
+ "latin.meta",
+ "law",
+ "law.meta",
+ "lifehacks",
+ "lifehacks.meta",
+ "linguistics",
+ "linguistics.meta",
+ "literature",
+ "literature.meta",
+ "magento",
+ "magento.meta",
+ "martialarts",
+ "martialarts.meta",
+ "materials",
+ "materials.meta",
+ "math",
+ "math.meta",
+ "matheducators",
+ "matheducators.meta",
+ "mathematica",
+ "mathematica.meta",
+ "mathoverflow",
+ "mechanics.meta",
+ "mechanics",
+ "meta.askubuntu",
+ "meta.mathoverflow",
+ "meta.serverfault",
+ "meta.stackexchange",
+ "meta.stackoverflow",
+ "meta.superuser",
+ "moderators.meta",
+ "moderators",
+ "monero.meta",
+ "monero",
+ "money.meta",
+ "money",
+ "movies.meta",
+ "movies",
+ "music.meta",
+ "music",
+ "musicfans.meta",
+ "musicfans",
+ "mythology.meta",
+ "mythology",
+ "networkengineering.meta",
+ "networkengineering",
+ "opendata.meta",
+ "opendata",
+ "opensource.meta",
+ "opensource",
+ "or.meta",
+ "or",
+ "outdoors.meta",
+ "outdoors",
+ "parenting.meta",
+ "parenting",
+ "patents.meta",
+ "patents",
+ "pets.meta",
+ "pets",
+ "philosophy.meta",
+ "philosophy",
+ "photo.meta",
+ "photo",
+ "physics.meta",
+ "physics",
+ "pm.meta",
+ "pm",
+ "poker.meta",
+ "poker",
+ "politics.meta",
+ "politics",
+ "portuguese.meta",
+ "portuguese",
+ "puzzling.meta",
+ "puzzling",
+ "quant.meta",
+ "quant",
+ "quantumcomputing.meta",
+ "quantumcomputing",
+ "raspberrypi.meta",
+ "raspberrypi",
+ "retrocomputing.meta",
+ "retrocomputing",
+ "reverseengineering.meta",
+ "reverseengineering",
+ "robotics.meta",
+ "robotics",
+ "rpg.meta",
+ "rpg",
+ "rus.meta",
+ "rus",
+ "russian.meta",
+ "russian",
+ "salesforce.meta",
+ "salesforce",
+ "scicomp.meta",
+ "scicomp",
+ "scifi.meta",
+ "scifi",
+ "security.meta",
+ "security",
+ "serverfault",
+ "sharepoint",
+ "sharepoint.meta",
+ "sitecore",
+ "sitecore.meta",
+ "skeptics",
+ "skeptics.meta",
+ "softwareengineering",
+ "softwareengineering.meta",
+ "softwarerecs",
+ "softwarerecs.meta",
+ "sound",
+ "sound.meta",
+ "space",
+ "space.meta",
+ "spanish",
+ "spanish.meta",
+ "sports",
+ "sports.meta",
+ "sqa",
+ "sqa.meta",
+ "stackapps",
+ "stats.meta",
+ "stats",
+ "stellar.meta",
+ "stellar",
+ "superuser",
+ "sustainability",
+ "sustainability.meta",
+ "tex",
+ "tex.meta",
+ "tezos",
+ "tezos.meta",
+ "tor",
+ "tor.meta",
+ "travel",
+ "travel.meta",
+ "tridion",
+ "tridion.meta",
+ "ukrainian",
+ "ukrainian.meta",
+ "unix",
+ "unix.meta",
+ "ux",
+ "ux.meta",
+ "vegetarianism",
+ "vegetarianism.meta",
+ "vi",
+ "vi.meta",
+ "webapps",
+ "webapps.meta",
+ "webmasters",
+ "webmasters.meta",
+ "windowsphone",
+ "windowsphone.meta",
+ "woodworking",
+ "woodworking.meta",
+ "wordpress",
+ "wordpress.meta",
+ "workplace",
+ "workplace.meta",
+ "worldbuilding",
+ "worldbuilding.meta",
+ "writers",
+ "writers.meta",
+ "Stackoverflow", # hardcoded for different URL structure
+]
+
+# Some excluded stack exchanges below (not a maintained list)
+# spanish: es.meta.stackoverflow.com.7z, es.stackoverflow.com.7z
+# japanese: ja.meta.stackoverflow.com.7z, ja.stackoverflow.com.7z
+# some language: pt.stackoverflow.com, pt.meta.stackoverflow.com
+# ru.stackoverflow, ru.meta.stackoverflow
+
+# stack exchanges with different processing, these end in .net ;(
+DOTNET_LIST = ["mathoverflow", "meta.mathoverflow"]
+
+# stack exchanges without .stackexchange.com in their URL (includes the above)
+SHORT_URL_LIST = [
+ "askubuntu",
+ "meta.askubuntu",
+ "meta.serverfault",
+ "meta.stackexchange",
+ "meta.stackoverflow",
+ "stackexchange",
+ "superuser",
+ "meta.superuser",
+ "serverfault",
+ "stackapps",
+ "Stackoverflow",
+]
+SHORT_URL_LIST += DOTNET_LIST
+
+
+def get_and_unpack_7z(directory: str, data_save_dir: str, save_dir_override: str = None):
+ # check if unpacked data exists (no need to re-download):
+ se_name_7z = directory[directory.rfind("/") + 1 :]
+ se_name = se_name_7z[:-3]
+ assert ".7z" == se_name_7z[-3:]
+ if not os.path.exists(data_save_dir + se_name_7z):
+ print("Loading raw data, this can take a second!")
+
+ ex_data_url = (
+ # "https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml/resolve/main/"\
+ "https://archive.org/download/stackexchange/"
+ + se_name_7z
+ )
+
+ response = requests.get(ex_data_url, allow_redirects=True)
+ filename = os.path.basename(ex_data_url)
+
+ print("Unpacking raw data.")
+ if response.status_code == 200:
+ with open(DATA_DIR + filename, "wb") as out:
+ out.write(response.content)
+ os.mkdir(DATA_DIR + se_name)
+ with py7zr.SevenZipFile(DATA_DIR + filename, "r") as archive:
+ if save_dir_override:
+ save_dir = save_dir_override
+ else:
+ save_dir = se_name
+ archive.extractall(DATA_DIR + save_dir + "/")
+ else:
+ print("Request failed: %d" % response.status_code)
+
+ print("Loaded & unpacked data, now processing...")
+ else:
+ print("Raw 7z data already exists for this dir :)")
+
+
+def print_dict(d):
+ for key, val in d.items():
+ print(f"{key}, {val}")
+
+
+def simplify_date(date_string):
+ date = datetime.datetime.strptime(date_string.split(".")[0], "%Y-%m-%dT%H:%M:%S")
+ return date.strftime("%Y/%m/%d")
+
+
+if __name__ == "__main__":
+ parser = ArgumentParser()
+ parser.add_argument(
+ "--all",
+ action="store_true",
+ help="If the script will process all stack exchanges: warning, requires large amount of RAM",
+ )
+ parser.add_argument("--save_path", default=DATA_DIR, type=str, help="Path to the huggingface dataset preferably.")
+ parser.add_argument(
+ "--start_idx",
+ default=0,
+ type=int,
+ help="Optional value to skip a number of exchanges in the above list if processing crashed midway",
+ )
+ parser.add_argument("--shard_size", default=100, type=int, help="Maximum size of file for subsets of data in MB")
+ parser.add_argument("--debug", action="store_true", help="Added print statements for debugging")
+ parser.set_defaults(debug=False, all=False)
+
+ args = parser.parse_args()
+
+ shard_size = str(args.shard_size) + "MB"
+ process_all = args.all
+ save_path = args.save_path
+ start_idx = args.start_idx
+ DEBUG = args.debug
+ if process_all:
+ se_list = ALL_EXCHANGES
+ else:
+ print("Run from command line with --all=True to process all data")
+ se_list = ["ai", "apple", "pets", "ai.meta"]
+
+ os.makedirs(DATA_DIR, exist_ok=True)
+
+ # Process all exchanges in loop (saves in memory)
+ TOTAL = len(se_list) - 1
+ for i, se_sub_name in enumerate(se_list[start_idx:]):
+ print(f"SECTION {i + start_idx}/{TOTAL}: {se_sub_name} - START")
+
+ # some stack exchanges don't use .stackexchange.com
+ if se_sub_name not in SHORT_URL_LIST:
+ se_full_name = se_sub_name + ".stackexchange.com"
+ elif se_sub_name in DOTNET_LIST: # two exchanges need .net
+ se_full_name = se_sub_name + ".net"
+ else:
+ se_full_name = se_sub_name + ".com"
+
+ start_time = time.time()
+ full_section_data = []
+
+ # https://archive.org/download/stackexchange/Stackoverflow.com-Posts.7z
+ # https://archive.org/download/stackexchange/Stackoverflow.com-Users.7z
+
+ # get_and_unpack_7z()
+ ex_data_file = DATA_DIR + se_full_name + "/Users.xml"
+ # check if unpacked data exists:
+ if not os.path.exists(ex_data_file):
+ # get raw data
+ ex_data_file_7z = se_full_name + ".7z"
+ if "Stackoverflow.com" in ex_data_file_7z:
+ base_stackoverflow_dir = ex_data_file_7z[:-3]
+ get_and_unpack_7z(
+ base_stackoverflow_dir + "-Posts.7z", DATA_DIR, save_dir_override="stackoverflow.com"
+ )
+ get_and_unpack_7z(
+ base_stackoverflow_dir.lower() + "-Users.7z", DATA_DIR, save_dir_override="stackoverflow.com"
+ ) # users dir only is lowercase s
+ else:
+ get_and_unpack_7z(ex_data_file_7z, DATA_DIR)
+
+ # load extracted xml files
+ local_path = (
+ DATA_DIR + se_full_name.lower() + "/"
+ ) # "ai.stackexchange.com/" # again, .lower() for the Stackexchange.com/Users
+ posts_subpath = "Posts.xml"
+ users_subpath = "Users.xml"
+
+ """
+ XML file structure:
+ * PostTypeID ranges from 1: Question, 2: Answer, ....
+ * We only want posts with AcceptedAnswerId fields
+ (docs https://meta.stackexchange.com/questions/2677/database-schema-documentation-for-the-public-data-dump-and-sede)
+ """
+
+ user_info = {-1: "(user-deleted)"}
+ question_info = {}
+ answer_info = {}
+
+ # extract user data for license
+ with open(local_path + users_subpath, "rb") as f: # Users file
+ tree = ET.parse(f)
+ for exchange in tree.iter("row"):
+ tag = int(exchange.attrib["Id"])
+ user_info[tag] = str(exchange.attrib["DisplayName"])
+
+ if DEBUG:
+ print_dict(user_info)
+
+ with open(local_path + posts_subpath, "rb") as f: # Posts file
+ tree = ET.parse(f)
+
+ # process questions, find answers next
+ # note: this could all be done in one loop, storing everything, if memory is cheaper than processing speed
+
+ # iterate through all rows
+ for exchange in tree.iter("row"):
+ # find 2+ answers
+ if "AnswerCount" in exchange.attrib:
+ ans_count = int(exchange.attrib["AnswerCount"])
+
+ # only save questions with >= 2 answers
+ if ans_count >= 2:
+ tag = int(exchange.attrib["Id"])
+
+ result = {}
+ result["Body"] = exchange.attrib["Body"]
+
+ # store some metadata
+ result["AnswerCount"] = ans_count
+ result["PostScore"] = int(exchange.attrib["Score"])
+
+ # save metadata
+ if "OwnerUserId" in exchange.attrib:
+ user_id = int(exchange.attrib["OwnerUserId"])
+ else:
+ user_id = -1 # deleted user redirect to community page
+
+ result["Author"] = user_id # should fail for some deleted entries
+ result["metadata"] = [
+ "https://" + se_full_name + "/questions/" + str(tag), # question URL
+ "https://" + se_full_name, # Exchange URL
+ "https://"
+ + se_full_name
+ + "/users/"
+ + str(user_id)
+ + "/", # Author URL -- don't include username afterwards to avoid case with spaces in name (string regex problem)
+ ]
+ result["Date"] = simplify_date(exchange.attrib["CreationDate"])
+
+ # if accepted answer, store it
+ if "AcceptedAnswerId" in exchange.attrib:
+ accepted_ans = int(exchange.attrib["AcceptedAnswerId"])
+ result["AcceptedAnswerId"] = accepted_ans
+ else:
+ result["AcceptedAnswerId"] = None
+
+ question_info[tag] = result
+ if DEBUG:
+ print_dict(question_info[tag])
+
+ # process looking for answers
+ for exchange in tree.iter("row"):
+ # answers are ID type 2
+ if int(exchange.attrib["PostTypeId"]) == 2:
+ # get parent, check if in question_info
+ parent = int(exchange.attrib["ParentId"])
+ # note, that parent will be same as tag above in answer_info and question_info
+
+ # log if parent is in questions (multiple answers for preference model)
+ if parent in question_info:
+ # info for answers
+ ans_text = exchange.attrib["Body"]
+ ans_score = int(exchange.attrib["Score"])
+ ans_id = int(exchange.attrib["Id"]) # extra score if this ID matches accept id above
+
+ # save metadata
+ if "OwnerUserId" in exchange.attrib:
+ user_id = int(exchange.attrib["OwnerUserId"])
+ else:
+ user_id = -1 # deleted user
+ # we'll need to store multiple answers per tag
+ if parent not in answer_info:
+ answer_info[parent] = {}
+ answer_info[parent]["Text"] = []
+ answer_info[parent]["Score"] = []
+ answer_info[parent]["Id"] = []
+ answer_info[parent]["Author"] = []
+ answer_info[parent]["AuthorNames"] = []
+
+ answer_info[parent]["Text"].append(ans_text)
+ answer_info[parent]["Score"].append(ans_score)
+ answer_info[parent]["Id"].append(ans_id)
+ answer_info[parent]["Author"].append(user_id) # should fail for some deleted entries
+ # fix rare case that the username for answer authors is not in the database
+ if user_id in user_info:
+ username = user_info[user_id]
+ else:
+ username = "(user-not-found)"
+ answer_info[parent]["AuthorNames"].append(username)
+
+ if DEBUG:
+ print_dict(answer_info[parent])
+
+ qa_keys = question_info.keys()
+
+ final_outputs = {"domain": se_sub_name}
+
+ for k in qa_keys:
+ question_data = question_info[k]
+
+ accepted_ans = question_data["AcceptedAnswerId"]
+
+ answer_data = answer_info[k]
+ metadata = question_data["metadata"]
+ date = question_data["Date"]
+
+ # filter for number of unique scores to be >= 2 (per paper)
+ scores = answer_data["Score"]
+ if len(np.unique(scores)) >= 2:
+ answers = []
+ for text, score, ans_id, auth_name, auth_id in zip(
+ answer_data["Text"], scores, answer_data["Id"], answer_data["AuthorNames"], answer_data["Author"]
+ ):
+ sub_answer = {}
+ accepted = accepted_ans == ans_id
+
+ if score >= 0:
+ s = round(np.log2(1 + score))
+
+ # not documented if negative answers can be accepted, assuming no
+ if accepted: # add 1 to score if answer was accepted
+ s += 1
+ else:
+ s = -1
+
+ sub_answer["answer_id"] = ans_id
+ sub_answer["text"] = text
+ sub_answer["pm_score"] = s
+ sub_answer["selected"] = accepted
+ sub_answer["author"] = auth_name
+ sub_answer["author_id"] = auth_id
+ sub_answer["author_profile"] = "https://" + se_full_name + "/users/" + str(auth_id)
+ answers.append(sub_answer)
+
+ json_obj = {
+ "qid": k,
+ "question": question_data["Body"],
+ "answers": answers,
+ "date": date,
+ "metadata": metadata,
+ }
+ full_section_data.append(json_obj)
+
+ print(f"finished section {se_full_name} at {time.time() - start_time}s")
+
+ if not DEBUG:
+ sublist_len = 100000
+
+ # bypass known issue in arrow https://issues.apache.org/jira/browse/ARROW-17137
+ if len(full_section_data) > sublist_len:
+ print(f"Processed dataset length > {sublist_len}, processing to HF dataset in chunks")
+ chunks = [
+ full_section_data[x : x + sublist_len] for x in range(0, len(full_section_data), sublist_len)
+ ]
+ ds_chunks = [Dataset.from_list(ch) for ch in chunks]
+ ds = concatenate_datasets(ds_chunks)
+ else:
+ ds = Dataset.from_list(full_section_data)
+
+ save_dataset_shards(ds, save_path, subset=se_full_name, shard_size=shard_size)
diff --git a/data_analysis/stackoverflow/other/main.py b/data_analysis/stackoverflow/other/main.py
new file mode 100644
index 0000000..65a88bb
--- /dev/null
+++ b/data_analysis/stackoverflow/other/main.py
@@ -0,0 +1,195 @@
+# Inspired by https://github.com/huggingface/h4/blob/main/scripts/data/pmp/stack_exchange_process.py
+import datetime
+import os
+import time
+import xml.etree.ElementTree as ET
+from collections import defaultdict
+
+from datasets import Dataset, concatenate_datasets
+from tqdm import tqdm
+
+# Note: Using rclone + py7zr in command line is often faster than this
+import py7zr
+import requests
+
+# If the cleaning becomes a bottleneck at some point, could be better to use
+# this snippet from Anton https://gist.github.com/anton-l/4bfafb42878a8e77b20f3b844d9cae36
+# (uses selectolax, faster than bs4) instead.
+from bs4 import BeautifulSoup
+from se_reference_utils import ALL_EXCHANGES
+
+
+DATA_DIR = "data/stack-exchange"
+WTOKEN = os.getenv("WTOKEN")
+
+
+def simplify_date(date_string):
+ date = datetime.datetime.strptime(date_string.split(".")[0], "%Y-%m-%dT%H:%M:%S")
+ return date.strftime("%Y/%m/%d")
+
+
+def download_and_extract_se7z(name: str, directory: str, data_save_dir: str, save_dir_override: str = None):
+ # Downloading 7z file
+ if os.path.exists(f"{data_save_dir}/{name}.7z"):
+ print("Raw 7z data already exists for this dir.")
+ else:
+ print("Downloading compressed data.")
+
+ ex_data_url = f"https://archive.org/download/stackexchange/{directory}"
+ response = requests.get(ex_data_url, allow_redirects=True)
+
+ if response.status_code != 200:
+ raise ConnectionError(f"Request failed: {response.status_code} for subset: {name}, url: {ex_data_url}")
+
+ print("Unpacking raw data.")
+ with open(f"{DATA_DIR}/{name}.7z", "wb") as out:
+ out.write(response.content)
+
+ os.mkdir(f"{DATA_DIR}/{name}")
+ with py7zr.SevenZipFile(f"{DATA_DIR}/{name}.7z", "r") as archive:
+ save_dir = save_dir_override if save_dir_override is not None else name
+ archive.extractall(f"{DATA_DIR}/{save_dir}/")
+
+ print(f"{name} successfully extracted.")
+
+
+def get_question_from_html(exchange):
+ question = {}
+ keys_of_interest = ["Id", "Body", "AnswerCount", "OwnerUserId", "PostScore", "Date", "AcceptedAnswerId"]
+ for key in keys_of_interest:
+ try:
+ if key in ["Id", "AnswerCount", "PostScore", "AcceptedAnswerId", "OwnerUserId"]:
+ question[key] = int(exchange.attrib[key])
+ elif key == "Date":
+ question[key] = simplify_date(exchange.attrib["CreationDate"])
+ elif key == "Body":
+ question[key] = exchange.attrib[key]
+ question["text"] = BeautifulSoup(exchange.attrib[key], "lxml").text
+ else:
+ question[key] = exchange.attrib[key]
+ except KeyError:
+ # deleted user redirect to community page > -1
+ question[key] = -1 if key == "OwnerUserId" else None
+
+ question["metadata"] = [
+ f"https://{se_sub_url}/questions/{str(question['Id'])}", # question URL
+ f"https://{se_sub_url}", # Exchange URL
+ f"https://{se_sub_url}/users/{str(question['OwnerUserId'])}/", # Author URL
+ ]
+
+ return question["Id"], question
+
+
+def get_answer_from_html(exchange):
+ # We connect answers to their parent's id
+ parent_id = int(exchange.attrib["ParentId"])
+
+ answer = {}
+ keys_of_interest = ["Body", "Score", "Id", "OwnerUserId"]
+ for key in keys_of_interest:
+ try:
+ if key in ["Score", "Id", "OwnerUserId"]:
+ answer[key] = int(exchange.attrib[key])
+ elif key == "Body":
+ answer[key] = exchange.attrib[key]
+ answer["text"] = BeautifulSoup(exchange.attrib[key], "lxml").text
+ else:
+ answer[key] = exchange.attrib[key]
+ except KeyError:
+ answer[key] = -1 if key == "OwnerUserId" else None
+
+ return parent_id, answer
+
+
+def get_posts_from_html(se_sub_name):
+ extracted_info = defaultdict(lambda: {"question": None, "answers": list()})
+ with open(f"{DATA_DIR}/{se_sub_name}/Posts.xml", "rb") as f:
+ tree = ET.parse(f)
+
+ for exchange in tree.iter("row"):
+ post_type = int(exchange.attrib["PostTypeId"])
+
+ if post_type == 1: # Question
+ if int(exchange.attrib["AnswerCount"]) > 0:
+ tag, question = get_question_from_html(exchange)
+ extracted_info[tag]["question"] = question
+
+ elif post_type == 2: # Answer
+ tag, answer = get_answer_from_html(exchange)
+ extracted_info[tag]["answers"].append(answer)
+ return extracted_info
+
+
+def get_jsonlines_from_posts(extracted_info):
+ result_jsonlines = []
+ for tag, data in extracted_info.items():
+ # Sort answers by score (see the LLaMA paper) and keep only positively scored ones
+ question = data["question"]
+ answers = [a for a in sorted(data["answers"], key=lambda x: x["Score"]) if a["Score"] > 0]
+
+ # We skip empty questions or answers
+ if question is None or len(answers) < 1:
+ continue
+
+ text = f"user{question['OwnerUserId']}: {question['text']}"
+ for answer in answers:
+ text += f"\nuser{answer['OwnerUserId']}: {answer['text']}"
+
+ result = {
+ "question_id": question["Id"],
+ "text": text,
+ "metadata": question["metadata"],
+ "date": question["Date"],
+ "original_text": [f"{item['OwnerUserId']}: {item['Body']}" for item in [question] + answers],
+ }
+ result_jsonlines.append(result)
+ return result_jsonlines
+
+
+def upload_to_hub(result_jsonlines):
+ size = len(result_jsonlines)
+ chunk_size = 100000
+ if size > chunk_size:
+ chunks = [
+ Dataset.from_list(result_jsonlines[i : min(i + chunk_size, size)]) for i in range(0, size, chunk_size)
+ ]
+ dataset = concatenate_datasets(chunks)
+ else:
+ dataset = Dataset.from_list(result_jsonlines)
+
+ dataset.push_to_hub("HuggingFaceGECLM/StackExchange_Mar2023", split=se_sub_name, private=True, token=WTOKEN)
+
+
+def main(se_sub_name, se_sub_url):
+ print(f"{se_sub_name} at {se_sub_url}.")
+ start_time = time.time()
+
+ # Download and extract
+ if not os.path.exists(f"{DATA_DIR}/{se_sub_name}/Posts.xml"):
+ if "se_sub_name" == "stackoverflow":
+ # Note: we'll also need -Users.7z if we want to filter on licenses at some point
+ download_and_extract_se7z(
+ se_sub_name, f"{se_sub_url}-Posts.7z", DATA_DIR, save_dir_override="stackoverflow.com"
+ )
+ else:
+ download_and_extract_se7z(se_sub_name, f"{se_sub_url}.7z", DATA_DIR)
+
+ # Select posts (questions and answers) from the Posts.xml tree
+ extracted_info = get_posts_from_html(se_sub_name)
+ print("Posts parsed from HTML.")
+
+ # Create json from posts
+ result_jsonlines = get_jsonlines_from_posts(extracted_info)
+
+ print(f"Finished {se_sub_url} in {time.time() - start_time}s. Contains {len(result_jsonlines)} lines.")
+
+ # Saves to the hub
+ upload_to_hub(result_jsonlines)
+
+
+if __name__ == "__main__":
+ os.makedirs(DATA_DIR, exist_ok=True)
+
+ # Process all exchanges in a loop - could be easily launched in parallel
+ for se_sub_name, se_sub_url in tqdm(ALL_EXCHANGES.items()):
+ main(se_sub_name, se_sub_url)
diff --git a/data_analysis/stackoverflow/other/requirements.txt b/data_analysis/stackoverflow/other/requirements.txt
new file mode 100644
index 0000000..80efabe
--- /dev/null
+++ b/data_analysis/stackoverflow/other/requirements.txt
@@ -0,0 +1,5 @@
+datasets
+py7zr
+requests
+tqdm
+bs4
\ No newline at end of file
diff --git a/data_analysis/stackoverflow/other/se_reference_utils.py b/data_analysis/stackoverflow/other/se_reference_utils.py
new file mode 100644
index 0000000..a9cc434
--- /dev/null
+++ b/data_analysis/stackoverflow/other/se_reference_utils.py
@@ -0,0 +1,347 @@
+ALL_EXCHANGES = {
+ "3dprinting.meta": "3dprinting.meta.stackexchange.com",
+ "3dprinting": "3dprinting.stackexchange.com",
+ "academia.meta": "academia.meta.stackexchange.com",
+ "academia": "academia.stackexchange.com",
+ "ai.meta": "ai.meta.stackexchange.com",
+ "ai": "ai.stackexchange.com",
+ "android.meta": "android.meta.stackexchange.com",
+ "android": "android.stackexchange.com",
+ "anime.meta": "anime.meta.stackexchange.com",
+ "anime": "anime.stackexchange.com",
+ "apple.meta": "apple.meta.stackexchange.com",
+ "apple": "apple.stackexchange.com",
+ "arduino.meta": "arduino.meta.stackexchange.com",
+ "arduino": "arduino.stackexchange.com",
+ "askubuntu": "askubuntu.com",
+ "astronomy": "astronomy.stackexchange.com",
+ "astronomy.meta": "astronomy.meta.stackexchange.com",
+ "aviation": "aviation.stackexchange.com",
+ "aviation.meta": "aviation.meta.stackexchange.com",
+ "avp": "avp.stackexchange.com",
+ "avp.meta": "avp.meta.stackexchange.com",
+ "beer": "beer.stackexchange.com",
+ "beer.meta": "beer.meta.stackexchange.com",
+ "bicycles": "bicycles.stackexchange.com",
+ "bicycles.meta": "bicycles.meta.stackexchange.com",
+ "bioinformatics": "bioinformatics.stackexchange.com",
+ "bioinformatics.meta": "bioinformatics.meta.stackexchange.com",
+ "biology": "biology.stackexchange.com",
+ "biology.meta": "biology.meta.stackexchange.com",
+ "bitcoin": "bitcoin.stackexchange.com",
+ "bitcoin.meta": "bitcoin.meta.stackexchange.com",
+ "blender": "blender.stackexchange.com",
+ "blender.meta": "blender.meta.stackexchange.com",
+ "boardgames": "boardgames.stackexchange.com",
+ "boardgames.meta": "boardgames.meta.stackexchange.com",
+ "bricks": "bricks.stackexchange.com",
+ "bricks.meta": "bricks.meta.stackexchange.com",
+ "buddhism": "buddhism.stackexchange.com",
+ "buddhism.meta": "buddhism.meta.stackexchange.com",
+ "cardano": "cardano.stackexchange.com",
+ "cardano.meta": "cardano.meta.stackexchange.com",
+ "chemistry": "chemistry.stackexchange.com",
+ "chemistry.meta": "chemistry.meta.stackexchange.com",
+ "chess": "chess.stackexchange.com",
+ "chess.meta": "chess.meta.stackexchange.com",
+ "chinese": "chinese.stackexchange.com",
+ "chinese.meta": "chinese.meta.stackexchange.com",
+ "christianity": "christianity.stackexchange.com",
+ "christianity.meta": "christianity.meta.stackexchange.com",
+ "civicrm": "civicrm.stackexchange.com",
+ "civicrm.meta": "civicrm.meta.stackexchange.com",
+ "codegolf": "codegolf.stackexchange.com",
+ "codegolf.meta": "codegolf.meta.stackexchange.com",
+ "codereview": "codereview.stackexchange.com",
+ "codereview.meta": "codereview.meta.stackexchange.com",
+ "coffee": "coffee.stackexchange.com",
+ "coffee.meta": "coffee.meta.stackexchange.com",
+ "cogsci": "cogsci.stackexchange.com",
+ "cogsci.meta": "cogsci.meta.stackexchange.com",
+ "computergraphics": "computergraphics.stackexchange.com",
+ "computergraphics.meta": "computergraphics.meta.stackexchange.com",
+ "conlang": "conlang.stackexchange.com",
+ "conlang.meta": "conlang.meta.stackexchange.com",
+ "cooking": "cooking.stackexchange.com",
+ "cooking.meta": "cooking.meta.stackexchange.com",
+ "craftcms": "craftcms.stackexchange.com",
+ "craftcms.meta": "craftcms.meta.stackexchange.com",
+ "crafts": "crafts.stackexchange.com",
+ "crafts.meta": "crafts.meta.stackexchange.com",
+ "crypto": "crypto.stackexchange.com",
+ "crypto.meta": "crypto.meta.stackexchange.com",
+ "cs": "cs.stackexchange.com",
+ "cs.meta": "cs.meta.stackexchange.com",
+ "cseducators": "cseducators.stackexchange.com",
+ "cseducators.meta": "cseducators.meta.stackexchange.com",
+ "cstheory": "cstheory.stackexchange.com",
+ "cstheory.meta": "cstheory.meta.stackexchange.com",
+ "datascience": "datascience.stackexchange.com",
+ "datascience.meta": "datascience.meta.stackexchange.com",
+ "dba": "dba.stackexchange.com",
+ "dba.meta": "dba.meta.stackexchange.com",
+ "devops": "devops.stackexchange.com",
+ "devops.meta": "devops.meta.stackexchange.com",
+ "diy": "diy.stackexchange.com",
+ "diy.meta": "diy.meta.stackexchange.com",
+ "drones": "drones.stackexchange.com",
+ "drones.meta": "drones.meta.stackexchange.com",
+ "drupal": "drupal.stackexchange.com",
+ "drupal.meta": "drupal.meta.stackexchange.com",
+ "dsp": "dsp.stackexchange.com",
+ "dsp.meta": "dsp.meta.stackexchange.com",
+ "earthscience": "earthscience.stackexchange.com",
+ "earthscience.meta": "earthscience.meta.stackexchange.com",
+ "ebooks": "ebooks.stackexchange.com",
+ "ebooks.meta": "ebooks.meta.stackexchange.com",
+ "economics": "economics.stackexchange.com",
+ "economics.meta": "economics.meta.stackexchange.com",
+ "electronics": "electronics.stackexchange.com",
+ "electronics.meta": "electronics.meta.stackexchange.com",
+ "elementaryos": "elementaryos.stackexchange.com",
+ "elementaryos.meta": "elementaryos.meta.stackexchange.com",
+ "ell": "ell.stackexchange.com",
+ "ell.meta": "ell.meta.stackexchange.com",
+ "emacs": "emacs.stackexchange.com",
+ "emacs.meta": "emacs.meta.stackexchange.com",
+ "engineering": "engineering.stackexchange.com",
+ "engineering.meta": "engineering.meta.stackexchange.com",
+ "english": "english.stackexchange.com",
+ "english.meta": "english.meta.stackexchange.com",
+ "eosio": "eosio.stackexchange.com",
+ "eosio.meta": "eosio.meta.stackexchange.com",
+ "esperanto": "esperanto.stackexchange.com",
+ "esperanto.meta": "esperanto.meta.stackexchange.com",
+ "ethereum": "ethereum.stackexchange.com",
+ "ethereum.meta": "ethereum.meta.stackexchange.com",
+ "expatriates": "expatriates.stackexchange.com",
+ "expatriates.meta": "expatriates.meta.stackexchange.com",
+ "expressionengine": "expressionengine.stackexchange.com",
+ "expressionengine.meta": "expressionengine.meta.stackexchange.com",
+ "fitness": "fitness.stackexchange.com",
+ "fitness.meta": "fitness.meta.stackexchange.com",
+ "freelancing": "freelancing.stackexchange.com",
+ "freelancing.meta": "freelancing.meta.stackexchange.com",
+ "french": "french.stackexchange.com",
+ "french.meta": "french.meta.stackexchange.com",
+ "gamedev": "gamedev.stackexchange.com",
+ "gamedev.meta": "gamedev.meta.stackexchange.com",
+ "gaming": "gaming.stackexchange.com",
+ "gaming.meta": "gaming.meta.stackexchange.com",
+ "gardening": "gardening.stackexchange.com",
+ "gardening.meta": "gardening.meta.stackexchange.com",
+ "genealogy": "genealogy.stackexchange.com",
+ "genealogy.meta": "genealogy.meta.stackexchange.com",
+ "german": "german.stackexchange.com",
+ "german.meta": "german.meta.stackexchange.com",
+ "gis": "gis.stackexchange.com",
+ "gis.meta": "gis.meta.stackexchange.com",
+ "graphicdesign": "graphicdesign.stackexchange.com",
+ "graphicdesign.meta": "graphicdesign.meta.stackexchange.com",
+ "ham": "ham.stackexchange.com",
+ "ham.meta": "ham.meta.stackexchange.com",
+ "hardwarerecs": "hardwarerecs.stackexchange.com",
+ "hardwarerecs.meta": "hardwarerecs.meta.stackexchange.com",
+ "health": "health.stackexchange.com",
+ "health.meta": "health.meta.stackexchange.com",
+ "hermeneutics": "hermeneutics.stackexchange.com",
+ "hermeneutics.meta": "hermeneutics.meta.stackexchange.com",
+ "hinduism": "hinduism.stackexchange.com",
+ "hinduism.meta": "hinduism.meta.stackexchange.com",
+ "history": "history.stackexchange.com",
+ "history.meta": "history.meta.stackexchange.com",
+ "homebrew": "homebrew.stackexchange.com",
+ "homebrew.meta": "homebrew.meta.stackexchange.com",
+ "hsm": "hsm.stackexchange.com",
+ "hsm.meta": "hsm.meta.stackexchange.com",
+ "interpersonal": "interpersonal.stackexchange.com",
+ "interpersonal.meta": "interpersonal.meta.stackexchange.com",
+ "iot": "iot.stackexchange.com",
+ "iot.meta": "iot.meta.stackexchange.com",
+ "iota": "iota.stackexchange.com",
+ "iota.meta": "iota.meta.stackexchange.com",
+ "islam": "islam.stackexchange.com",
+ "islam.meta": "islam.meta.stackexchange.com",
+ "italian": "italian.stackexchange.com",
+ "italian.meta": "italian.meta.stackexchange.com",
+ "japanese": "japanese.stackexchange.com",
+ "japanese.meta": "japanese.meta.stackexchange.com",
+ "joomla": "joomla.stackexchange.com",
+ "joomla.meta": "joomla.meta.stackexchange.com",
+ "judaism": "judaism.stackexchange.com",
+ "judaism.meta": "judaism.meta.stackexchange.com",
+ "korean": "korean.stackexchange.com",
+ "korean.meta": "korean.meta.stackexchange.com",
+ "languagelearning": "languagelearning.stackexchange.com",
+ "languagelearning.meta": "languagelearning.meta.stackexchange.com",
+ "latin": "latin.stackexchange.com",
+ "latin.meta": "latin.meta.stackexchange.com",
+ "law": "law.stackexchange.com",
+ "law.meta": "law.meta.stackexchange.com",
+ "lifehacks": "lifehacks.stackexchange.com",
+ "lifehacks.meta": "lifehacks.meta.stackexchange.com",
+ "linguistics": "linguistics.stackexchange.com",
+ "linguistics.meta": "linguistics.meta.stackexchange.com",
+ "literature": "literature.stackexchange.com",
+ "literature.meta": "literature.meta.stackexchange.com",
+ "magento": "magento.stackexchange.com",
+ "magento.meta": "magento.meta.stackexchange.com",
+ "martialarts": "martialarts.stackexchange.com",
+ "martialarts.meta": "martialarts.meta.stackexchange.com",
+ "materials": "materials.stackexchange.com",
+ "materials.meta": "materials.meta.stackexchange.com",
+ "math": "math.stackexchange.com",
+ "math.meta": "math.meta.stackexchange.com",
+ "matheducators": "matheducators.stackexchange.com",
+ "matheducators.meta": "matheducators.meta.stackexchange.com",
+ "mathematica": "mathematica.stackexchange.com",
+ "mathematica.meta": "mathematica.meta.stackexchange.com",
+ "mathoverflow": "mathoverflow.net",
+ "mechanics.meta": "mechanics.meta.stackexchange.com",
+ "mechanics": "mechanics.stackexchange.com",
+ "meta.askubuntu": "meta.askubuntu.com",
+ "meta.mathoverflow": "meta.mathoverflow.net",
+ "meta.serverfault": "meta.serverfault.com",
+ "meta.stackexchange": "meta.stackexchange.com",
+ "meta.stackoverflow": "meta.stackoverflow.com",
+ "meta.superuser": "meta.superuser.com",
+ "moderators.meta": "moderators.meta.stackexchange.com",
+ "moderators": "moderators.stackexchange.com",
+ "monero.meta": "monero.meta.stackexchange.com",
+ "monero": "monero.stackexchange.com",
+ "money.meta": "money.meta.stackexchange.com",
+ "money": "money.stackexchange.com",
+ "movies.meta": "movies.meta.stackexchange.com",
+ "movies": "movies.stackexchange.com",
+ "music.meta": "music.meta.stackexchange.com",
+ "music": "music.stackexchange.com",
+ "musicfans.meta": "musicfans.meta.stackexchange.com",
+ "musicfans": "musicfans.stackexchange.com",
+ "mythology.meta": "mythology.meta.stackexchange.com",
+ "mythology": "mythology.stackexchange.com",
+ "networkengineering.meta": "networkengineering.meta.stackexchange.com",
+ "networkengineering": "networkengineering.stackexchange.com",
+ "opendata.meta": "opendata.meta.stackexchange.com",
+ "opendata": "opendata.stackexchange.com",
+ "opensource.meta": "opensource.meta.stackexchange.com",
+ "opensource": "opensource.stackexchange.com",
+ "or.meta": "or.meta.stackexchange.com",
+ "or": "or.stackexchange.com",
+ "outdoors.meta": "outdoors.meta.stackexchange.com",
+ "outdoors": "outdoors.stackexchange.com",
+ "parenting.meta": "parenting.meta.stackexchange.com",
+ "parenting": "parenting.stackexchange.com",
+ "patents.meta": "patents.meta.stackexchange.com",
+ "patents": "patents.stackexchange.com",
+ "pets.meta": "pets.meta.stackexchange.com",
+ "pets": "pets.stackexchange.com",
+ "philosophy.meta": "philosophy.meta.stackexchange.com",
+ "philosophy": "philosophy.stackexchange.com",
+ "photo.meta": "photo.meta.stackexchange.com",
+ "photo": "photo.stackexchange.com",
+ "physics.meta": "physics.meta.stackexchange.com",
+ "physics": "physics.stackexchange.com",
+ "pm.meta": "pm.meta.stackexchange.com",
+ "pm": "pm.stackexchange.com",
+ "poker.meta": "poker.meta.stackexchange.com",
+ "poker": "poker.stackexchange.com",
+ "politics.meta": "politics.meta.stackexchange.com",
+ "politics": "politics.stackexchange.com",
+ "portuguese.meta": "portuguese.meta.stackexchange.com",
+ "portuguese": "portuguese.stackexchange.com",
+ "puzzling.meta": "puzzling.meta.stackexchange.com",
+ "puzzling": "puzzling.stackexchange.com",
+ "quant.meta": "quant.meta.stackexchange.com",
+ "quant": "quant.stackexchange.com",
+ "quantumcomputing.meta": "quantumcomputing.meta.stackexchange.com",
+ "quantumcomputing": "quantumcomputing.stackexchange.com",
+ "raspberrypi.meta": "raspberrypi.meta.stackexchange.com",
+ "raspberrypi": "raspberrypi.stackexchange.com",
+ "retrocomputing.meta": "retrocomputing.meta.stackexchange.com",
+ "retrocomputing": "retrocomputing.stackexchange.com",
+ "reverseengineering.meta": "reverseengineering.meta.stackexchange.com",
+ "reverseengineering": "reverseengineering.stackexchange.com",
+ "robotics.meta": "robotics.meta.stackexchange.com",
+ "robotics": "robotics.stackexchange.com",
+ "rpg.meta": "rpg.meta.stackexchange.com",
+ "rpg": "rpg.stackexchange.com",
+ "rus.meta": "rus.meta.stackexchange.com",
+ "rus": "rus.stackexchange.com",
+ "russian.meta": "russian.meta.stackexchange.com",
+ "russian": "russian.stackexchange.com",
+ "salesforce.meta": "salesforce.meta.stackexchange.com",
+ "salesforce": "salesforce.stackexchange.com",
+ "scicomp.meta": "scicomp.meta.stackexchange.com",
+ "scicomp": "scicomp.stackexchange.com",
+ "scifi.meta": "scifi.meta.stackexchange.com",
+ "scifi": "scifi.stackexchange.com",
+ "security.meta": "security.meta.stackexchange.com",
+ "security": "security.stackexchange.com",
+ "serverfault": "serverfault.com",
+ "sharepoint": "sharepoint.stackexchange.com",
+ "sharepoint.meta": "sharepoint.meta.stackexchange.com",
+ "sitecore": "sitecore.stackexchange.com",
+ "sitecore.meta": "sitecore.meta.stackexchange.com",
+ "skeptics": "skeptics.stackexchange.com",
+ "skeptics.meta": "skeptics.meta.stackexchange.com",
+ "softwareengineering": "softwareengineering.stackexchange.com",
+ "softwareengineering.meta": "softwareengineering.meta.stackexchange.com",
+ "softwarerecs": "softwarerecs.stackexchange.com",
+ "softwarerecs.meta": "softwarerecs.meta.stackexchange.com",
+ "sound": "sound.stackexchange.com",
+ "sound.meta": "sound.meta.stackexchange.com",
+ "space": "space.stackexchange.com",
+ "space.meta": "space.meta.stackexchange.com",
+ "spanish": "spanish.stackexchange.com",
+ "spanish.meta": "spanish.meta.stackexchange.com",
+ "sports": "sports.stackexchange.com",
+ "sports.meta": "sports.meta.stackexchange.com",
+ "sqa": "sqa.stackexchange.com",
+ "sqa.meta": "sqa.meta.stackexchange.com",
+ "stackapps": "stackapps.com",
+ # "stackexchange": "stackexchange.com",
+ "stats.meta": "stats.meta.stackexchange.com",
+ "stats": "stats.stackexchange.com",
+ "stellar.meta": "stellar.meta.stackexchange.com",
+ "stellar": "stellar.stackexchange.com",
+ "superuser": "superuser.com",
+ "sustainability": "sustainability.stackexchange.com",
+ "sustainability.meta": "sustainability.meta.stackexchange.com",
+ "tex": "tex.stackexchange.com",
+ "tex.meta": "tex.meta.stackexchange.com",
+ "tezos": "tezos.stackexchange.com",
+ "tezos.meta": "tezos.meta.stackexchange.com",
+ "tor": "tor.stackexchange.com",
+ "tor.meta": "tor.meta.stackexchange.com",
+ "travel": "travel.stackexchange.com",
+ "travel.meta": "travel.meta.stackexchange.com",
+ "tridion": "tridion.stackexchange.com",
+ "tridion.meta": "tridion.meta.stackexchange.com",
+ "ukrainian": "ukrainian.stackexchange.com",
+ "ukrainian.meta": "ukrainian.meta.stackexchange.com",
+ "unix": "unix.stackexchange.com",
+ "unix.meta": "unix.meta.stackexchange.com",
+ "ux": "ux.stackexchange.com",
+ "ux.meta": "ux.meta.stackexchange.com",
+ "vegetarianism": "vegetarianism.stackexchange.com",
+ "vegetarianism.meta": "vegetarianism.meta.stackexchange.com",
+ "vi": "vi.stackexchange.com",
+ "vi.meta": "vi.meta.stackexchange.com",
+ "webapps": "webapps.stackexchange.com",
+ "webapps.meta": "webapps.meta.stackexchange.com",
+ "webmasters": "webmasters.stackexchange.com",
+ "webmasters.meta": "webmasters.meta.stackexchange.com",
+ "windowsphone": "windowsphone.stackexchange.com",
+ "windowsphone.meta": "windowsphone.meta.stackexchange.com",
+ "woodworking": "woodworking.stackexchange.com",
+ "woodworking.meta": "woodworking.meta.stackexchange.com",
+ "wordpress": "wordpress.stackexchange.com",
+ "wordpress.meta": "wordpress.meta.stackexchange.com",
+ "workplace": "workplace.stackexchange.com",
+ "workplace.meta": "workplace.meta.stackexchange.com",
+ "worldbuilding": "worldbuilding.stackexchange.com",
+ "worldbuilding.meta": "worldbuilding.meta.stackexchange.com",
+ "writers": "writers.stackexchange.com",
+ "writers.meta": "writers.meta.stackexchange.com",
+ "stackoverflow": "stackoverflow.com",
+}