"""Group nearby bus stops.

Reads the ``stopList`` section of ``routeFareList.min.json``, groups stops
that lie within a small radius of each other and writes the result to
``groupBus_all.json``.
"""
import json
import math
from typing import TYPE_CHECKING

from scipy.spatial import KDTree

if TYPE_CHECKING:  # only needed for annotations; imported lazily at runtime
    import polars as pl

EARTH_RADIUS_M = 6371000  # Earth radius in meters


def haversine_distance(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
    """Return the great-circle distance in metres between two points.

    All four arguments are in decimal degrees.
    """
    phi1 = math.radians(lat1)
    phi2 = math.radians(lat2)
    delta_phi = math.radians(lat2 - lat1)
    delta_lambda = math.radians(lon2 - lon1)

    a = (
        math.sin(delta_phi / 2) ** 2
        + math.cos(phi1) * math.cos(phi2) * math.sin(delta_lambda / 2) ** 2
    )
    # Rounding can push ``a`` marginally outside [0, 1] (near-antipodal
    # points), which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return EARTH_RADIUS_M * c


def calculate_bearing(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
    """Return the initial bearing from point 1 to point 2, in degrees [0, 360).

    All four arguments are in decimal degrees; 0 is north, 90 is east.
    """
    phi1 = math.radians(lat1)
    phi2 = math.radians(lat2)
    delta_lambda = math.radians(lon2 - lon1)

    y = math.sin(delta_lambda) * math.cos(phi2)
    x = (
        math.cos(phi1) * math.sin(phi2)
        - math.sin(phi1) * math.cos(phi2) * math.cos(delta_lambda)
    )
    theta = math.atan2(y, x)

    # atan2 yields (-180, 180]; shift into [0, 360).
    return (math.degrees(theta) + 360) % 360


def _search_radius_degrees(lat: float, max_distance: float) -> float:
    """Return a degree-space radius that covers ``max_distance`` metres at ``lat``.

    The KD-tree is built over raw (lat, lng) degrees, so a metre radius has
    to be converted. One degree of longitude shrinks by cos(latitude), so the
    radius is widened accordingly, with a 1 % safety margin. The result is a
    deliberate over-estimate: it is only a pre-filter, the exact haversine
    check happens afterwards. Not reliable within about 1 degree of the poles
    or across the antimeridian.
    """
    angular = math.degrees(max_distance / EARTH_RADIUS_M)
    # Use the latitude furthest from the equator that the circle can reach.
    worst_lat = min(abs(lat) + abs(angular), 89.0)
    return 1.01 * angular / math.cos(math.radians(worst_lat))


def group_bus_stops(
    bus_stops: "pl.DataFrame",
    max_distance: float = 50,
    bearing_threshold: float = 35,
) -> "pl.DataFrame":
    """Assign nearby stops to numbered groups.

    Args:
        bus_stops: frame with columns ``id``, ``lat``, ``lng``, ``name_en``
            and ``name_zh`` (in that order, to match the output schema).
        max_distance: maximum haversine distance in metres between the seed
            stop of a group and any other member.
        bearing_threshold: maximum bearing difference in degrees applied from
            the second candidate of a group onwards.

    Returns:
        A frame with the input columns plus an Int32 ``bus_group_id``; one
        row per (group, member) pair. Group ids start at 1.
    """
    # Imported here so the geometry helpers stay importable without polars.
    import polars as pl

    groups = pl.DataFrame(
        schema={
            "id": str,
            "lat": pl.Float64,
            "lng": pl.Float64,
            "name_en": str,
            "name_zh": str,
            "bus_group_id": pl.Int32,
        }
    )

    print(len(bus_stops))
    if bus_stops.height == 0:
        return groups

    # Pull the coordinates out once instead of slicing the frame per lookup.
    lats = bus_stops["lat"].to_list()
    lngs = bus_stops["lng"].to_list()
    tree = KDTree(list(zip(lats, lngs)))

    visited = set()
    group_id = 1

    for i, (lat, lng) in enumerate(zip(lats, lngs)):
        if i in visited:
            continue

        # Row indices of this group; the seed stop always comes first.
        members = [i]
        radius = _search_radius_degrees(lat, max_distance)
        # Sorted indices keep the order-dependent bearing rule deterministic.
        candidates = tree.query_ball_point([lat, lng], r=radius, return_sorted=True)

        for j in candidates:
            if j == i or j in visited:
                continue
            if haversine_distance(lat, lng, lats[j], lngs[j]) > max_distance:
                continue

            if len(members) > 1:
                # NOTE(review): for the second candidate the "previous" stop
                # is the seed itself, so bearing1 is computed between
                # identical points - confirm this is the intended rule.
                prev = members[-2]
                bearing1 = calculate_bearing(lats[prev], lngs[prev], lat, lng)
                bearing2 = calculate_bearing(lat, lng, lats[j], lngs[j])
                diff = abs(bearing1 - bearing2)
                if not (diff <= bearing_threshold or diff >= 360 - bearing_threshold):
                    continue

            members.append(j)

        # Explicit Int32 so every row matches the schema of ``groups``.
        group_tag = pl.lit(group_id, dtype=pl.Int32).alias("bus_group_id")
        for member in members:
            groups = groups.vstack(bus_stops[member].with_columns(group_tag))

        group_id += 1
        # NOTE(review): only the seed is marked visited, so a stop that joined
        # this group is later emitted again as the seed of its own group.
        # Kept as-is; confirm whether members should be marked as well.
        visited.add(i)

    return groups


def main() -> None:
    """Read the stop list, group the stops and write ``groupBus_all.json``."""
    import polars as pl

    with open("routeFareList.min.json", 'r', encoding='utf8') as f:
        stop_list = json.load(f)['stopList']

    records = [
        {
            "id": stop_id,
            "lat": stop['location']['lat'],
            "lng": stop['location']['lng'],
            "name_en": stop['name']['en'],
            "name_zh": stop['name']['zh'],
        }
        for stop_id, stop in stop_list.items()
    ]

    grouped_bus_stops = group_bus_stops(pl.from_dicts(records))

    with open('groupBus_all.json', 'w', encoding='utf8') as f:
        f.write(grouped_bus_stops.write_json())


if __name__ == '__main__':
    main()