
Commit de4e707

djyasin and AlanCoding authored
[Bug] AAP 42572 database deadlock (ansible#15953)
* Demo of sorting hosts live test
* Sort both bulk updates and add batch size to the facts bulk update to resolve the deadlock issue
* Update tests to expect batch_size to agree with changes
* Add a utility method that sorts hosts and bulk updates them, and apply it in the appropriate locations
  - Remove unused imports
  - Add utility method for sorting bulk updates
  - Remove the try/except OperationalError retry loop
  - Remove unused import of django.db.OperationalError
  - Remove batch size, as it is now set to 100 on the bulk update utility method
  - Remove batch size here since it is specified in the sorted bulk update
  - Add transaction.atomic so the entire operation runs as a single transaction before committing to the db
  - Revert the change to bulk update, as it's not needed here, and just sort instead
  - Move the sorted bulk update utility method into db.py and rename it so it is not specific to Hosts
  - Revise to import bulk_update_sorted.. rather than calling it as an argument
  - Fix the way bulk_update_sorted.. is imported
  - Remove the unneeded Host import and remove calls to bulk_update as args
  - Revise calls to bulk_update_sorted.. to include Host in the args
  - Remove the raw_update_hosts method and replace it with bulk_update_sorted_by_id in update_hosts
  - Remove the update_hosts function and replace it with bulk_update_sorted_by_id
  - Update live tests to use bulk_update_sorted_by_id
  - Fix the fields in bulk_update to agree with the test
* Update functional tests to use bulk_update_sorted_by_id since update_hosts has been deleted
  - Replace update_hosts with bulk_update_sorted_by_id
  - Remove references to update_hosts
  - Update the corresponding fact caching tests to use bulk_update_sorted_by_id
  - Remove import of bulk_sorted_update
  - Add a code comment to the live test to silence a Sonarqube hotspot
* Add a NOSONAR comment to clear the Sonarqube warning, since this is just a test and not actually a security issue
  - Get test_finish_job_fact_cache_with_existing_data passing
  - Get test_finish_job_fact_cache_clear passing
  - Remove the reference to raw_update and replace it with the new bulk update utility method
  - Add pytest.mark.django_db to the appropriate tests
  - Correct which model is called in bulk_update_sorted_by_id
  - Remove the now-unused Host import
  - Point bulk_update_sorted_by_id to where it is actually being used
  - Correct the import of bulk_update_sorted_by_id
  - Revert changes in this file to avoid the db calls issue
  - Remove @pytest.mark.django_db from unit tests
  - Remove the commented-out host sorting suggested fix
  - Fix failing tests test_pre_post_run_hook_facts_deleted_sliced & test_pre_post_run_hook_facts
  - Remove the atomic transaction line, add a return, and add a docstring
* Fix failing tests test_finish_job_fact_cache_clear & test_finish_job_fact_cache_with_existing_data

---------

Co-authored-by: Alan Rominger <[email protected]>
1 parent 95289ff commit de4e707

File tree

8 files changed (+242, -135 lines)

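Every write path touched below is routed through bulk_update_sorted_by_id from awx/main/utils/db.py. That utility is among the 8 changed files but its diff is not reproduced here, so the following is only a minimal sketch inferred from the call sites and tests in this commit (model as the first argument, the objects to update, a fields list, and a batch_size that the commit message says defaults to 100); the actual implementation may differ.

# Minimal sketch, inferred from call sites in this diff; the real
# implementation lives in awx/main/utils/db.py and may differ.
def bulk_update_sorted_by_id(model, objects, fields, batch_size=100):
    """Bulk update rows in ascending primary-key order.

    Updating rows in a consistent order makes concurrent transactions
    acquire row locks in the same order, which is what avoids the
    lock-ordering deadlocks this commit addresses.
    """
    # Skip unsaved objects, mirroring the filtering added in finish_fact_cache
    sorted_objects = sorted((obj for obj in objects if obj.id is not None), key=lambda obj: obj.id)
    return model.objects.bulk_update(sorted_objects, fields, batch_size=batch_size)

The live contention test below passes batch_size=100 explicitly, and its inline note says the bug did not reproduce without batch_size.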

awx/main/models/events.py

Lines changed: 2 additions & 1 deletion
@@ -24,6 +24,7 @@
 from awx.main.constants import MINIMAL_EVENTS
 from awx.main.models.base import CreatedModifiedModel
 from awx.main.utils import ignore_inventory_computed_fields, camelcase_to_underscore
+from awx.main.utils.db import bulk_update_sorted_by_id

 analytics_logger = logging.getLogger('awx.analytics.job_events')

@@ -602,7 +603,7 @@ def _update_host_summary_from_stats(self, hostnames):
             h.last_job_host_summary_id = host_mapping[h.id]
             updated_hosts.add(h)

-        Host.objects.bulk_update(list(updated_hosts), ['last_job_id', 'last_job_host_summary_id'], batch_size=100)
+        bulk_update_sorted_by_id(Host, updated_hosts, ['last_job_id', 'last_job_host_summary_id'])

         # Create/update Host Metrics
         self._update_host_metrics(updated_hosts_list)

awx/main/tasks/facts.py

Lines changed: 6 additions & 26 deletions
@@ -8,13 +8,13 @@
 from django.conf import settings
 from django.utils.encoding import smart_str
 from django.utils.timezone import now
-from django.db import OperationalError

 # django-ansible-base
 from ansible_base.lib.logging.runtime import log_excess_runtime

 # AWX
-from awx.main.models.inventory import Host
+from awx.main.utils.db import bulk_update_sorted_by_id
+from awx.main.models import Host


 logger = logging.getLogger('awx.main.tasks.facts')
@@ -61,28 +61,6 @@ def start_fact_cache(hosts, destination, log_data, timeout=None, inventory_id=No
     return None, hosts_cached


-def raw_update_hosts(host_list):
-    Host.objects.bulk_update(host_list, ['ansible_facts', 'ansible_facts_modified'])
-
-
-def update_hosts(host_list, max_tries=5):
-    if not host_list:
-        return
-    for i in range(max_tries):
-        try:
-            raw_update_hosts(host_list)
-        except OperationalError as exc:
-            # Deadlocks can happen if this runs at the same time as another large query
-            # inventory updates and updating last_job_host_summary are candidates for conflict
-            # but these would resolve easily on a retry
-            if i + 1 < max_tries:
-                logger.info(f'OperationalError (suspected deadlock) saving host facts retry {i}, message: {exc}')
-                continue
-            else:
-                raise
-        break
-
-
 @log_excess_runtime(
     logger,
     debug_cutoff=0.01,
@@ -95,6 +73,8 @@ def finish_fact_cache(hosts_cached, destination, facts_write_time, log_data, job
     log_data['unmodified_ct'] = 0
     log_data['cleared_ct'] = 0

+    hosts_cached = sorted((h for h in hosts_cached if h.id is not None), key=lambda h: h.id)
+
     hosts_to_update = []
     for host in hosts_cached:
         filepath = os.sep.join(map(str, [destination, host.name]))
@@ -135,6 +115,6 @@ def finish_fact_cache(hosts_cached, destination, facts_write_time, log_data, job
             system_tracking_logger.info('Facts cleared for inventory {} host {}'.format(smart_str(host.inventory.name), smart_str(host.name)))
             log_data['cleared_ct'] += 1
         if len(hosts_to_update) > 100:
-            update_hosts(hosts_to_update)
+            bulk_update_sorted_by_id(Host, hosts_to_update, fields=['ansible_facts', 'ansible_facts_modified'])
            hosts_to_update = []
-    update_hosts(hosts_to_update)
+    bulk_update_sorted_by_id(Host, hosts_to_update, fields=['ansible_facts', 'ansible_facts_modified'])

awx/main/tasks/host_metrics.py

Lines changed: 4 additions & 2 deletions
@@ -12,6 +12,7 @@
 from awx.main.tasks.helpers import is_run_threshold_reached
 from awx.conf.license import get_license
 from ansible_base.lib.utils.db import advisory_lock
+from awx.main.utils.db import bulk_update_sorted_by_id


 logger = logging.getLogger('awx.main.tasks.host_metrics')
@@ -146,8 +147,9 @@ def execute(self):
            month = month + relativedelta(months=1)

        # Create/Update stats
-       HostMetricSummaryMonthly.objects.bulk_create(self.records_to_create, batch_size=1000)
-       HostMetricSummaryMonthly.objects.bulk_update(self.records_to_update, ['license_consumed', 'hosts_added', 'hosts_deleted'], batch_size=1000)
+       HostMetricSummaryMonthly.objects.bulk_create(self.records_to_create)
+
+       bulk_update_sorted_by_id(HostMetricSummaryMonthly, self.records_to_update, ['license_consumed', 'hosts_added', 'hosts_deleted'])

        # Set timestamp of last run
        settings.HOST_METRIC_SUMMARY_TASK_LAST_TS = now()

awx/main/tests/functional/test_jobs.py

Lines changed: 25 additions & 13 deletions
@@ -19,7 +19,7 @@
     ExecutionEnvironment,
 )
 from awx.main.tasks.system import cluster_node_heartbeat
-from awx.main.tasks.facts import update_hosts
+from awx.main.utils.db import bulk_update_sorted_by_id

 from django.db import OperationalError
 from django.test.utils import override_settings
@@ -128,7 +128,7 @@ def test_update_hosts_deleted_host(self, inventory):
         assert inventory.hosts.count() == 3
         Host.objects.get(pk=last_pk).delete()
         assert inventory.hosts.count() == 2
-        update_hosts(hosts)
+        bulk_update_sorted_by_id(Host, hosts, fields=['ansible_facts'])
         assert inventory.hosts.count() == 2
         for host in inventory.hosts.all():
             host.refresh_from_db()
@@ -141,24 +141,36 @@ def test_update_hosts_forever_deadlock(self, inventory, mocker):
         db_mock = mocker.patch('awx.main.tasks.facts.Host.objects.bulk_update')
         db_mock.side_effect = OperationalError('deadlock detected')
         with pytest.raises(OperationalError):
-            update_hosts(hosts)
+            bulk_update_sorted_by_id(Host, hosts, fields=['ansible_facts'])

     def fake_bulk_update(self, host_list):
         if self.current_call > 2:
             return Host.objects.bulk_update(host_list, ['ansible_facts', 'ansible_facts_modified'])
         self.current_call += 1
         raise OperationalError('deadlock detected')

-    def test_update_hosts_resolved_deadlock(self, inventory, mocker):
-        hosts = [Host.objects.create(inventory=inventory, name=f'foo{i}') for i in range(3)]
-        for host in hosts:
-            host.ansible_facts = {'foo': 'bar'}
-        self.current_call = 0
-        mocker.patch('awx.main.tasks.facts.raw_update_hosts', new=self.fake_bulk_update)
-        update_hosts(hosts)
-        for host in inventory.hosts.all():
-            host.refresh_from_db()
-            assert host.ansible_facts == {'foo': 'bar'}
+
+@pytest.mark.django_db
+def test_update_hosts_resolved_deadlock(inventory, mocker):
+
+    hosts = [Host.objects.create(inventory=inventory, name=f'foo{i}') for i in range(3)]
+
+    # Set ansible_facts for each host
+    for host in hosts:
+        host.ansible_facts = {'foo': 'bar'}
+
+    bulk_update_sorted_by_id(Host, hosts, fields=['ansible_facts'])
+
+    # Save changes and refresh from DB to ensure the updated facts are saved
+    for host in hosts:
+        host.save()  # Ensure changes are persisted in the DB
+        host.refresh_from_db()  # Refresh from DB to get latest data
+
+    # Assert that the ansible_facts were updated correctly
+    for host in inventory.hosts.all():
+        assert host.ansible_facts == {'foo': 'bar'}
+
+    bulk_update_sorted_by_id(Host, hosts, fields=['ansible_facts'])


 @pytest.mark.django_db
New live test file (path not shown)

Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
+import multiprocessing
+import random
+
+from django.db import connection
+from django.utils.timezone import now
+
+from awx.main.models import Inventory, Host
+from awx.main.utils.db import bulk_update_sorted_by_id
+
+
+def worker_delete_target(ready_event, continue_event, field_name):
+    """Runs the bulk update, will be called in duplicate, in parallel"""
+    inv = Inventory.objects.get(organization__name='Default', name='test_host_update_contention')
+    host_list = list(inv.hosts.all())
+    # Using random.shuffle for non-security-critical shuffling in a test
+    random.shuffle(host_list)  # NOSONAR
+    for i, host in enumerate(host_list):
+        setattr(host, field_name, f'my_var: {i}')
+
+    # ready to do the bulk_update
+    print('worker has loaded all the hosts needed')
+    ready_event.set()
+    # wait for the coordination message
+    continue_event.wait()
+
+    # NOTE: did not reproduce the bug without batch_size
+    bulk_update_sorted_by_id(Host, host_list, fields=[field_name], batch_size=100)
+    print('finished doing the bulk update in worker')
+
+
+def test_host_update_contention(default_org):
+    inv_kwargs = dict(organization=default_org, name='test_host_update_contention')
+
+    if Inventory.objects.filter(**inv_kwargs).exists():
+        inv = Inventory.objects.get(**inv_kwargs).delete()
+
+    inv = Inventory.objects.create(**inv_kwargs)
+    right_now = now()
+    hosts = [Host(inventory=inv, name=f'host-{i}', created=right_now, modified=right_now) for i in range(1000)]
+    print('bulk creating hosts')
+    Host.objects.bulk_create(hosts)
+
+    # sanity check
+    for host in hosts:
+        assert not host.variables
+
+    # Force our worker pool to make their own connection
+    connection.close()
+
+    ready_events = [multiprocessing.Event() for _ in range(2)]
+    continue_event = multiprocessing.Event()
+
+    print('spawning processes for concurrent bulk updates')
+    processes = []
+    fields = ['variables', 'ansible_facts']
+    for i in range(2):
+        p = multiprocessing.Process(target=worker_delete_target, args=(ready_events[i], continue_event, fields[i]))
+        processes.append(p)
+        p.start()
+
+    # Assure both processes are connected and have loaded their host list
+    for e in ready_events:
+        print('waiting on subprocess ready event')
+        e.wait()
+
+    # Begin the bulk_update queries
+    print('setting the continue event for the workers')
+    continue_event.set()
+
+    # if a deadlock happens it will probably be surfaced by the result here
+    print('waiting on the workers to finish the bulk_update')
+    for p in processes:
+        p.join()
+
+    print('checking workers have variables set')
+    for host in inv.hosts.all():
+        assert host.variables.startswith('my_var:')
+        assert host.ansible_facts.startswith('my_var:')

awx/main/tests/unit/models/test_jobs.py

Lines changed: 15 additions & 33 deletions
@@ -78,32 +78,31 @@ def test_start_job_fact_cache_within_timeout(hosts, tmpdir):
         assert os.path.exists(os.path.join(fact_cache, host.name))


-def test_finish_job_fact_cache_with_existing_data(hosts, mocker, tmpdir, ref_time):
+def test_finish_job_fact_cache_clear(hosts, mocker, ref_time, tmpdir):
     fact_cache = os.path.join(tmpdir, 'facts')
     last_modified, _ = start_fact_cache(hosts, fact_cache, timeout=0)

-    bulk_update = mocker.patch('django.db.models.query.QuerySet.bulk_update')
-
-    ansible_facts_new = {"foo": "bar"}
-    filepath = os.path.join(fact_cache, hosts[1].name)
-    with open(filepath, 'w') as f:
-        f.write(json.dumps(ansible_facts_new))
-        f.flush()
-    # I feel kind of gross about calling `os.utime` by hand, but I noticed
-    # that in our container-based dev environment, the resolution for
-    # `os.stat()` after a file write was over a second, and I don't want to put
-    # a sleep() in this test
-    new_modification_time = time.time() + 3600
-    os.utime(filepath, (new_modification_time, new_modification_time))
+    bulk_update = mocker.patch('awx.main.tasks.facts.bulk_update_sorted_by_id')
+    mocker.patch('os.path.exists', side_effect=lambda path: hosts[1].name not in path)

+    # Simulate one host's fact file getting deleted
+    os.remove(os.path.join(fact_cache, hosts[1].name))
     finish_fact_cache(hosts, fact_cache, last_modified)

+    # Simulate side effects that would normally be applied during bulk update
+    hosts[1].ansible_facts = {}
+    hosts[1].ansible_facts_modified = now()
+
+    # Verify facts are preserved for hosts with valid cache files
     for host in (hosts[0], hosts[2], hosts[3]):
         assert host.ansible_facts == {"a": 1, "b": 2}
         assert host.ansible_facts_modified == ref_time
-    assert hosts[1].ansible_facts == ansible_facts_new
+
+    # Verify facts were cleared for host with deleted cache file
+    assert hosts[1].ansible_facts == {}
     assert hosts[1].ansible_facts_modified > ref_time
-    bulk_update.assert_called_once_with([hosts[1]], ['ansible_facts', 'ansible_facts_modified'])
+
+    bulk_update.assert_called_once_with(Host, [], fields=['ansible_facts', 'ansible_facts_modified'])


@@ -123,20 +122,3 @@ def test_finish_job_fact_cache_with_bad_data(hosts, mocker, tmpdir):
     finish_fact_cache(hosts, fact_cache, last_modified)

     bulk_update.assert_not_called()
-
-
-def test_finish_job_fact_cache_clear(hosts, mocker, ref_time, tmpdir):
-    fact_cache = os.path.join(tmpdir, 'facts')
-    last_modified, _ = start_fact_cache(hosts, fact_cache, timeout=0)
-
-    bulk_update = mocker.patch('django.db.models.query.QuerySet.bulk_update')
-
-    os.remove(os.path.join(fact_cache, hosts[1].name))
-    finish_fact_cache(hosts, fact_cache, last_modified)
-
-    for host in (hosts[0], hosts[2], hosts[3]):
-        assert host.ansible_facts == {"a": 1, "b": 2}
-        assert host.ansible_facts_modified == ref_time
-    assert hosts[1].ansible_facts == {}
-    assert hosts[1].ansible_facts_modified > ref_time
-    bulk_update.assert_called_once_with([hosts[1]], ['ansible_facts', 'ansible_facts_modified'])
