import pandas as pd
import utils.dateutils as dateutils
from db.mongoqueries import agg_total_by_week, agg_total_per_outlet
from fastapi import APIRouter, HTTPException, Request, Query
from schemas.stats_by_date import TotalStatsByDate
from schemas.stats_weekly import TotalStatsByWeek

outlet_router = APIRouter()
COLLECTION_NAME = "mediaDaily"
LOWER_BOUND_START_DATE = "2018-10-01"
ID_MAPPING = {"Huffington Post": "HuffPost Canada"}


@outlet_router.get(
    "/info_by_date",
    response_model=TotalStatsByDate,
    response_description="Get total and per outlet gender statistics for English outlets between two dates",
)
def expertwomen_info_by_date(
    request: Request,
    begin: str = Query(description="Start date in yyyy-mm-dd format"),
    end: str = Query(description="End date in yyyy-mm-dd format"),
) -> TotalStatsByDate:
    if not dateutils.is_valid_date_range(begin, end, LOWER_BOUND_START_DATE):
        raise HTTPException(
            status_code=416,
            detail=f"Date range error: Should be between {LOWER_BOUND_START_DATE} and tomorrow's date",
        )
    begin = dateutils.convert_date(begin)
    end = dateutils.convert_date(end)

    query = agg_total_per_outlet(begin, end)
    response = request.app.connection[COLLECTION_NAME].aggregate(query)
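    # Each aggregated document carries the outlet name in `_id` along with its
    # totalFemales, totalMales, totalUnknowns and totalArticles counts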
    # Work with the data in pandas
    source_stats = list(response)
    df = pd.DataFrame.from_dict(source_stats)
    df["totalGenders"] = df["totalFemales"] + df["totalMales"] + df["totalUnknowns"]
    # Replace outlet names if necessary
    df["_id"] = df["_id"].replace(ID_MAPPING)
    # Sum the per-outlet totals across all outlets and convert to dict
    result = df.drop("_id", axis=1).sum().to_dict()
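    # `result` now holds the overall totals, which serve as the denominators for
    # the per-outlet and overall percentages computed below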
    # Compute per outlet stats
    df["perFemales"] = df["totalFemales"] / df["totalGenders"]
    df["perMales"] = df["totalMales"] / df["totalGenders"]
    df["perUnknowns"] = df["totalUnknowns"] / df["totalGenders"]
    df["perArticles"] = df["totalArticles"] / result["totalArticles"]
    # Convert dataframe to dict prior to JSON serialization
    result["sources"] = df.to_dict("records")
    result["perFemales"] = result["totalFemales"] / result["totalGenders"]
    result["perMales"] = result["totalMales"] / result["totalGenders"]
    result["perUnknowns"] = result["totalUnknowns"] / result["totalGenders"]
    return result


@outlet_router.get(
    "/weekly_info",
    response_model=TotalStatsByWeek,
    response_description="Get gender statistics per English outlet aggregated WEEKLY between two dates",
)
def expertwomen_weekly_info(
    request: Request,
    begin: str = Query(description="Start date in yyyy-mm-dd format"),
    end: str = Query(description="End date in yyyy-mm-dd format"),
) -> TotalStatsByWeek:
    if not dateutils.is_valid_date_range(begin, end, LOWER_BOUND_START_DATE):
        raise HTTPException(
            status_code=416,
            detail=f"Date range error: Should be between {LOWER_BOUND_START_DATE} and tomorrow's date",
        )
    begin = dateutils.convert_date(begin)
    end = dateutils.convert_date(end)

    query = agg_total_by_week(begin, end)
    response = request.app.connection[COLLECTION_NAME].aggregate(query)
    # Work with the data in pandas
    df = (
        pd.json_normalize(list(response), max_level=1)
        .sort_values(by="_id.outlet")
        .reset_index(drop=True)
    )
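    # The aggregation groups on a compound key, so json_normalize flattens the
    # nested `_id` into `_id.outlet`, `_id.week` and `_id.year` columns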
    df.rename(
        columns={
            "_id.outlet": "outlet",
            "_id.week": "week",
            "_id.year": "year",
        },
        inplace=True,
    )
    # Replace outlet names if necessary
    df["outlet"] = df["outlet"].replace(ID_MAPPING)
    # Derive each week's begin/end dates and convert them to datetimes for summing by week
    df["w_begin"] = df.apply(lambda row: dateutils.get_week_bound(row["year"], row["week"], 0), axis=1)
    df["w_end"] = df.apply(lambda row: dateutils.get_week_bound(row["year"], row["week"], 6), axis=1)
    df["w_begin"] = pd.to_datetime(df["w_begin"])
    df["w_end"] = pd.to_datetime(df["w_end"])
    df = (
        df.drop(columns=["week", "year"])
        .sort_values(by=["outlet", "w_begin"])
    )
    # Earlier versions returned two entries for the same week begin date whenever the last
    # week of a year spilled into the next one (a partial week across the year boundary).
    # Summing stats by week here ensures each week begin date reaches the front end only once.
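    # For example, a week starting 2019-12-30 crosses the year boundary, so grouping on
    # (year, week) in Mongo can emit one row for the 2019 portion and another for the
    # 2020 portion; both map to the same w_begin/w_end and are merged by the sum below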
    df = df.groupby(["outlet", "w_begin", "w_end"]).sum().reset_index()
    df["totalGenders"] = df["totalFemales"] + df["totalMales"] + df["totalUnknowns"]
    df["perFemales"] = df["totalFemales"] / df["totalGenders"]
    df["perMales"] = df["totalMales"] / df["totalGenders"]
    df["perUnknowns"] = df["totalUnknowns"] / df["totalGenders"]
    # Convert datetimes back to string for JSON serialization
    df["w_begin"] = df["w_begin"].dt.strftime("%Y-%m-%d")
    df["w_end"] = df["w_end"].dt.strftime("%Y-%m-%d")
    df = df.drop(columns=["totalGenders", "totalFemales", "totalMales", "totalUnknowns"])

    # Convert dataframe to dict prior to JSON serialization
    weekly_data = {}
    for outlet in df["outlet"].unique():
        per_outlet_data = df[df["outlet"] == outlet].to_dict(orient="records")
        # Drop the redundant outlet key from each weekly record
        for item in per_outlet_data:
            item.pop("outlet")
        weekly_data[outlet] = per_outlet_data
    return {"outlets": weekly_data}
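

# A minimal sketch of how this router might be wired up. The database handle on
# `app.connection` and the URL prefix are assumptions for illustration, not part
# of this module:
#
#     from fastapi import FastAPI
#     from pymongo import MongoClient
#
#     app = FastAPI()
#     app.connection = MongoClient()["mediaTracker"]  # hypothetical database name
#     app.include_router(outlet_router, prefix="/expertwomen")  # hypothetical prefix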