
Commit b8ce90a

As2org crawler (#172)

Add crawler for CAIDA's AS to organization mappings.

---------

Co-authored-by: Malte Tashiro <[email protected]>

1 parent 0b0ef4a commit b8ce90a

6 files changed: +224 -1 lines changed

ACKNOWLEDGMENTS.md

Lines changed: 4 additions & 0 deletions
@@ -56,6 +56,10 @@ and
 
 > AS Relationships (serial-1), https://catalog.caida.org/dataset/as_relationships_serial_1
 
+and
+
+> AS to organization mappings, https://catalog.caida.org/dataset/as_organizations/
+
 ## Cisco
 
 We use the [Cisco Umbrella Popularity

config.json.example

Lines changed: 1 addition & 0 deletions
@@ -70,6 +70,7 @@
 "iyp.crawlers.peeringdb.ix",
 "iyp.crawlers.caida.ixs",
 "iyp.crawlers.caida.ix_asns",
+"iyp.crawlers.caida.as2org",
 "iyp.crawlers.cloudflare.top100",
 "iyp.crawlers.tranco.top1m",
 "iyp.crawlers.openintel.tranco1m",

documentation/data-sources.md

Lines changed: 1 addition & 0 deletions
@@ -17,6 +17,7 @@
 | CAIDA | AS Rank | https://doi.org/10.21986/CAIDA.DATA.AS-RANK | [README](https://github.com/InternetHealthReport/internet-yellow-pages/tree/main/iyp/crawlers/caida#readme) |
 | | IXPs Dataset | https://doi.org/10.21986/CAIDA.DATA.IXPS | |
 | | AS Relationships | https://catalog.caida.org/dataset/as_relationships_serial_1 | |
+| | AS Organizations | https://catalog.caida.org/dataset/as_organizations | |
 | Cisco | Umbrella Popularity List | https://s3-us-west-1.amazonaws.com/umbrella-static/index.html | [README](https://github.com/InternetHealthReport/internet-yellow-pages/tree/main/iyp/crawlers/cisco#readme) |
 | Citizen Lab | URL testing lists | https://github.com/citizenlab/test-lists | [README](https://github.com/InternetHealthReport/internet-yellow-pages/tree/main/iyp/crawlers/citizenlab#readme) |
 | Cloudflare | Cloudflare Radar API endpoints radar/dns/top/ases, radar/dns/top/locations, radar/ranking/top, radar/datasets | https://radar.cloudflare.com | [README](https://github.com/InternetHealthReport/internet-yellow-pages/tree/main/iyp/crawlers/cloudflare#readme) |

documentation/node-types.md

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@
 | AuthoritativeNameServer | Authoritative DNS nameserver for a set of domain names, uniquely identified with the **name** property. |
 | BGPCollector | A RIPE RIS or RouteViews BGP collector, uniquely identified with the **name** property. |
 | CaidaIXID | Unique identifier for IXPs from CAIDA's IXP dataset. |
+| CaidaOrgID | Identifier for Organizations from CAIDA's AS to organization dataset. |
 | Country | Represent an economy, uniquely identified by either its two or three character code (properties **country_code** and **alpha3**). |
 | DomainName | Any DNS domain name that is not a FQDN (see HostName), uniquely identified by the **name** property. |
 | Estimate | Represent a report that approximate a quantity, for example the World Bank population estimate. |

iyp/crawlers/caida/README.md

Lines changed: 22 additions & 1 deletion
@@ -106,4 +106,25 @@ relationship, BGPKIT uses `rel: 1` for customer-provider, whereas CAIDA uses `re
 
 ### Dependence
 
-The as_relationship crawler does not depend on other crawlers.
+The as_relationship crawler does not depend on other crawlers.
+
+## AS Organizations (as2org.py)
+
+AS to organization mappings derived by CAIDA from quarterly WHOIS dumps.
+
+### Graph representation
+
+```cypher
+(:AS {asn: 2497})-[:MANAGED_BY {org_id: '@aut-2497-JPNIC'}]->(:Organization {name: 'Internet Initiative Japan Inc.'})
+(:Organization {name: 'Internet Initiative Japan Inc.'})-[:COUNTRY {org_ids: ['@aut-2497-JPNIC']}]->(:Country {country_code: 'JP'})
+(:Organization {name: 'Internet Initiative Japan Inc.'})-[:NAME {org_ids: ['@aut-2497-JPNIC']}]->(:Name {name: 'Internet Initiative Japan Inc.'})
+(:Organization {name: 'Internet Initiative Japan Inc.'})-[:EXTERNAL_ID]->(:CaidaOrgID {id: '@aut-2497-JPNIC'})
+```
+
+We keep track of the org_id assigned by CAIDA both in the relationships and by adding a
+`CaidaOrgID` node. The main reason for this is that there are organizations with the
+same name but different IDs that may map to different countries.
+
+### Dependence
+
+The as2org crawler does not depend on other crawlers.
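
For illustration, queries like the following, assuming a graph populated by this crawler (they are not part of the commit), show how this representation can be used: the first returns the ASes managed by the example organization together with the CAIDA org ID stored on each `MANAGED_BY` relationship; the second lists organizations whose name maps to more than one country, the case that motivates keeping the `org_ids` property and the `CaidaOrgID` nodes.

```cypher
// Illustrative queries only; they assume a graph populated by this crawler.

// ASes managed by the example organization, with the CAIDA org ID
// recorded on each MANAGED_BY relationship.
MATCH (a:AS)-[m:MANAGED_BY]->(o:Organization {name: 'Internet Initiative Japan Inc.'})
RETURN a.asn, m.org_id;

// Organizations whose name maps to more than one country, restricted to
// COUNTRY relationships that carry the org_ids property set by this crawler.
MATCH (o:Organization)-[c:COUNTRY]->(cc:Country)
WHERE c.org_ids IS NOT NULL
WITH o, collect(DISTINCT cc.country_code) AS countries, collect(c.org_ids) AS org_ids
WHERE size(countries) > 1
RETURN o.name, countries, org_ids;
```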

iyp/crawlers/caida/as2org.py

Lines changed: 195 additions & 0 deletions
@@ -0,0 +1,195 @@
import argparse
import gzip
import json
import logging
import sys
from collections import defaultdict
from datetime import datetime, timezone

import arrow
import requests

from iyp import BaseCrawler, DataNotAvailableError

# URL to AS2Org API
URL = 'https://publicdata.caida.org/datasets/as-organizations/'
ORG = 'CAIDA'
NAME = 'caida.as2org'


# (:AS)-[:MANAGED_BY]->(:Organization) // Most relevant
# (:Organization)-[:COUNTRY]->(:Country)
# (:Organization)-[:NAME]->(:Name)

class Crawler(BaseCrawler):

    def __init__(self, organization, url, name):
        super().__init__(organization, url, name)
        self.reference['reference_url_info'] = 'https://publicdata.caida.org/datasets/as-organizations/README.txt'

    def __set_modification_time_from_metadata_line(self, date_str):
        try:
            date = datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc)
            self.reference['reference_time_modification'] = date
        except (json.JSONDecodeError, KeyError, ValueError) as e:
            logging.warning(f'Failed to get modification date from metadata line: {date_str.strip()}')
            logging.warning(e)
            logging.warning('Using date from filename.')

    def run(self):
        date = arrow.now()
        for _ in range(6):
            full_url = URL + f'{date.year}{date.month:02d}01.as-org2info.txt.gz'
            req = requests.head(full_url)

            # Found the latest file
            if req.status_code == 200:
                url = full_url
                break

            date = date.shift(months=-1)

        else:
            # for loop was not 'broken', no file available
            raise DataNotAvailableError('No recent CAIDA as2org file available')
        date = date.datetime.replace(day=1, hour=0, minute=0, second=0, microsecond=0, tzinfo=timezone.utc)
        self.reference['reference_time_modification'] = date
        self.reference['reference_url_data'] = url

        logging.info(f'Fetching data from: {url}')
        req = requests.get(url)
        req.raise_for_status()

        logging.info('Processing data...')

        data = gzip.decompress(req.content).decode()
        lines = data.split('\n')

        lines = [line for line in lines if line.strip()]

        orgs_mode = True
        asn_orgid = dict()
        name_country_orgids = defaultdict(lambda: defaultdict(set))
        name_orgids = defaultdict(set)
        orgid_name = dict()
        countries = set()
        for line in lines:
            if line == '# format:org_id|changed|org_name|country|source':
                orgs_mode = True
            elif line == '# format:aut|changed|aut_name|org_id|opaque_id|source':
                orgs_mode = False
            elif 'program start time' in line:
                date = line.split('program start time:')[1]
                date = date.strip()
                self.__set_modification_time_from_metadata_line(date)

            if line.startswith('#'):
                continue

            fields = line.split('|')

            # extract org information with format:
            # org_id|changed|org_name|country|source
            # NB changed and source fields not used
            if orgs_mode:
                org_id = fields[0]
                if org_id.startswith('@del'):
                    # There are some placeholder organizations with no name and IDs
                    # starting with @del, which probably indicate some old relationship
                    # that no longer exists. Does not make sense to model them, since
                    # they all map to the same Organization node with an empty name.
                    continue
                org_name = fields[2]
                country = fields[3]
                # Index by name, since this is the identifier of the Organization node.
                # Keep track of which org ID is the source for a country. Some orgs have
                # multiple IDs mapping them to different countries.
                name_country_orgids[org_name][country].add(org_id)
                countries.add(country)
                # Some orgs (with the same name) map to multiple IDs.
                name_orgids[org_name].add(org_id)
                orgid_name[org_id] = org_name

            # extract org to as mapping with format:
            # aut|changed|aut_name|org_id|opaque_id|source
            # NB changed, aut_name, opaque_id, and source fields not used
            else:
                asn = int(fields[0])
                org_id = fields[3]
                if org_id.startswith('@del'):
                    continue
                asn_orgid[asn] = org_id

        names = set(name_orgids.keys())
        org_ids = set(orgid_name.keys())
        ases = set(asn_orgid.keys())
        caida_org_id = self.iyp.batch_get_nodes_by_single_prop('CaidaOrgID', 'id', org_ids)
        as_id = self.iyp.batch_get_nodes_by_single_prop('AS', 'asn', ases)
        organization_id = self.iyp.batch_get_nodes_by_single_prop('Organization', 'name', names)
        name_id = self.iyp.batch_get_nodes_by_single_prop('Name', 'name', names)
        country_id = self.iyp.batch_get_nodes_by_single_prop('Country', 'country_code', countries)

        managed_links = list()
        for asn, org_id in asn_orgid.items():
            org_qid = organization_id[orgid_name[org_id]]
            asn_qid = as_id[asn]
            managed_links.append({'src_id': asn_qid, 'dst_id': org_qid,
                                  'props': [self.reference, {'org_id': org_id}]})

        name_links = list()
        country_links = list()
        external_id_links = list()

        for name in name_orgids:
            org_ids = name_orgids[name]
            org_qid = organization_id[name]
            name_qid = name_id[name]

            name_links.append({'src_id': org_qid, 'dst_id': name_qid,
                               'props': [self.reference, {'org_ids': list(org_ids)}]})

            for org_id in org_ids:
                caida_org_id_qid = caida_org_id[org_id]
                external_id_links.append({'src_id': org_qid, 'dst_id': caida_org_id_qid, 'props': [self.reference]})

            for country, org_ids in name_country_orgids[name].items():
                country_qid = country_id[country]
                country_links.append({'src_id': org_qid, 'dst_id': country_qid,
                                      'props': [self.reference, {'org_ids': list(org_ids)}]})

        self.iyp.batch_add_links('COUNTRY', country_links)
        self.iyp.batch_add_links('EXTERNAL_ID', external_id_links)
        self.iyp.batch_add_links('MANAGED_BY', managed_links)
        self.iyp.batch_add_links('NAME', name_links)

    def unit_test(self):
        return super().unit_test(['COUNTRY', 'EXTERNAL_ID', 'MANAGED_BY', 'NAME'])


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument('--unit-test', action='store_true')
    args = parser.parse_args()

    FORMAT = '%(asctime)s %(levelname)s %(message)s'
    logging.basicConfig(
        format=FORMAT,
        filename='log/' + NAME + '.log',
        level=logging.INFO,
        datefmt='%Y-%m-%d %H:%M:%S'
    )

    logging.info(f'Started: {sys.argv}')

    crawler = Crawler(ORG, URL, NAME)
    if args.unit_test:
        crawler.unit_test()
    else:
        crawler.run()
        crawler.close()
    logging.info(f'Finished: {sys.argv}')


if __name__ == '__main__':
    main()
    sys.exit(0)
