From 9cf6337d5e463557863f9edcac820fba4440a1d9 Mon Sep 17 00:00:00 2001 From: Ilya Shakhat Date: Wed, 30 Aug 2017 15:00:07 +0200 Subject: [PATCH] Implement stress injection Add a new type of fault called "stress". When activated, this fault produces load on the CPU, disk, memory or kernel of a node. The functionality is implemented with the help of the stress-ng utility. Node collection API is extended: def stress(self, target, duration=None) Human API is extended, examples of commands: * stress cpu for 20 seconds on controller.domain.tld node * stress disk for 10 seconds on all nodes * stress memory for 60 seconds on all nodes with keystone service Change-Id: I8ddb2292b8dd19f476e4a5071259d1a90cbaa37c Note: 'stress-ng' is required to be installed on target nodes. --- os_faults/ansible/modules/stress.py | 44 ++++++++++++++++++++ os_faults/api/human.py | 23 +++++++++-- os_faults/api/node_collection.py | 13 ++++++ os_faults/tests/unit/api/test_human_api.py | 48 ++++++++++++++++++++++ 4 files changed, 124 insertions(+), 4 deletions(-) create mode 100644 os_faults/ansible/modules/stress.py diff --git a/os_faults/ansible/modules/stress.py b/os_faults/ansible/modules/stress.py new file mode 100644 index 0000000..c40febc --- /dev/null +++ b/os_faults/ansible/modules/stress.py @@ -0,0 +1,44 @@ +#!/usr/bin/python + +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License.
+ +from ansible.module_utils.basic import * # noqa + +STRESSORS_MAP = { + 'cpu': '--cpu 0', + 'disk': '--hdd 0', + 'memory': '--brk 0', + 'kernel': '--kill 0', + 'all': '--all 0', +} + + +def main(): + module = AnsibleModule( + argument_spec=dict( + target=dict(required=True, type='str'), + duration=dict(required=True, type='int') + )) + + target = module.params['target'] + stressor = STRESSORS_MAP.get(target) or STRESSORS_MAP['all'] + duration = module.params['duration'] + + cmd = 'bash -c "stress-ng %s --timeout %ss"' % (stressor, duration) + rc, stdout, stderr = module.run_command(cmd, check_rc=True) + + module.exit_json(cmd=cmd, rc=rc, stderr=stderr, stdout=stdout) + + +if __name__ == '__main__': + main() diff --git a/os_faults/api/human.py b/os_faults/api/human.py index 6c0856c..0ea35c8 100644 --- a/os_faults/api/human.py +++ b/os_faults/api/human.py @@ -30,6 +30,8 @@ Human API understands commands like these (examples): * unfreeze <service> service [on (random|one|single|<fqdn> node[s])] * reboot [random|one|single|<fqdn>] node[s] [with <service> service] * reset [random|one|single|<fqdn>] node[s] [with <service> service] + * stress [cpu|memory|disk|kernel for <T> seconds] on + [random|one|single|<fqdn>] node[s] [with <service> service] * disconnect <name> network on [random|one|single|<fqdn>] node[s] [with <service> service] * connect <name> network on [random|one|single|<fqdn>] node[s] @@ -44,7 +46,8 @@ def list_actions(klazz): hasattr(o, '__public__')))) RANDOMNESS = {'one', 'random', 'some', 'single'} -RANDOMNESS_PATTERN = '|'.join(RANDOMNESS) +ANYTHING = {'all'} +NODE_ALIASES_PATTERN = '|'.join(RANDOMNESS | ANYTHING) SERVICE_ACTIONS = list_actions(service_pkg.Service) SERVICE_ACTIONS_PATTERN = '|'.join(SERVICE_ACTIONS) NODE_ACTIONS = list_actions(node_collection_pkg.NodeCollection) @@ -58,10 +61,12 @@ PATTERNS = [ SERVICE_ACTIONS_PATTERN), re.compile('(?P<action>%s)' '(\s+(?P<network>\w+)\s+network\s+on)?' + '(\s+(?P<target>\w+)' + '(\s+for\s+(?P<duration>\d+)\s+seconds)(\s+on)?)?' '(\s+(?P<node>%s|\S+))?' '\s+nodes?' '(\s+with\s+(?P<service>\S+)\s+service)?'
% - (NODE_ACTIONS_PATTERN, RANDOMNESS_PATTERN)), + (NODE_ACTIONS_PATTERN, NODE_ALIASES_PATTERN)), ] @@ -82,6 +87,7 @@ def execute(destructor, command): service_name = groups.get('service') node_name = groups.get('node') network_name = groups.get('network') + target = groups.get('target') duration = groups.get('duration') if service_name: @@ -92,7 +98,7 @@ def execute(destructor, command): kwargs = {} if node_name in RANDOMNESS: kwargs['nodes'] = service.get_nodes().pick() - elif node_name: + elif node_name and node_name not in ANYTHING: kwargs['nodes'] = destructor.get_nodes(fqdns=[node_name]) if duration: @@ -110,15 +116,24 @@ def execute(destructor, command): kwargs = {} if network_name: kwargs['network_name'] = network_name + if target: + kwargs['target'] = target + kwargs['duration'] = int(duration) fn = getattr(nodes, action) fn(**kwargs) else: # nodes operation - nodes = destructor.get_nodes(fqdns=[node_name]) + if node_name and node_name not in ANYTHING: + nodes = destructor.get_nodes(fqdns=[node_name]) + else: + nodes = destructor.get_nodes() kwargs = {} if network_name: kwargs['network_name'] = network_name + if target: + kwargs['target'] = target + kwargs['duration'] = int(duration) fn = getattr(nodes, action) fn(**kwargs) diff --git a/os_faults/api/node_collection.py b/os_faults/api/node_collection.py index b396703..116d021 100644 --- a/os_faults/api/node_collection.py +++ b/os_faults/api/node_collection.py @@ -213,3 +213,16 @@ class NodeCollection(utils.ReprMixin): :param network_name: name of network """ raise NotImplementedError + + @public + def stress(self, target, duration=None): + """Stress node OS and hardware + + """ + duration = duration or 10 # defaults to 10 seconds + LOG.info('Stress %s for %ss on nodes %s', target, duration, self) + task = {'stress': { + 'target': target, + 'duration': duration, + }} + self.cloud_management.execute_on_cloud(self.hosts, task) diff --git a/os_faults/tests/unit/api/test_human_api.py 
b/os_faults/tests/unit/api/test_human_api.py index 06df1e5..93c2808 100644 --- a/os_faults/tests/unit/api/test_human_api.py +++ b/os_faults/tests/unit/api/test_human_api.py @@ -124,6 +124,54 @@ class TestHumanAPI(test.TestCase): destructor.get_nodes.assert_called_once_with(fqdns=['node-2.local']) getattr(nodes, action).assert_called_once() + @ddt.data('cpu', 'memory', 'disk', 'kernel') + def test_stress_by_fqdn(self, target): + action = 'stress' + duration = 20 + destructor = mock.MagicMock() + nodes = mock.MagicMock(node_collection.NodeCollection) + destructor.get_nodes = mock.MagicMock(return_value=nodes) + + command = 'stress %s for %d seconds on node-2.local node' % ( + target, duration) + human.execute(destructor, command) + + destructor.get_nodes.assert_called_once_with(fqdns=['node-2.local']) + getattr(nodes, action).assert_called_once_with( + target=target, duration=duration) + + @ddt.data('cpu', 'memory', 'disk', 'kernel') + def test_stress_target(self, target): + action = 'stress' + duration = 20 + destructor = mock.MagicMock() + nodes = mock.MagicMock(node_collection.NodeCollection) + destructor.get_nodes = mock.MagicMock(return_value=nodes) + + command = 'stress %s for %d seconds on nodes' % (target, duration) + human.execute(destructor, command) + + destructor.get_nodes.assert_called_once_with() + + getattr(nodes, action).assert_called_once_with( + target=target, duration=duration) + + @ddt.data(('CPU', 'cpu', 10, 'keystone'), + ('disk', 'disk', 20, 'nova-api')) + @ddt.unpack + def test_stress_by_service_on_fqdn_node(self, user_target, cmd_target, + duration, service_name): + action = 'stress' + nodes = mock.MagicMock(node_collection.NodeCollection) + self.service.get_nodes.return_value = nodes + + command = 'stress %s for %d seconds on all nodes with %s service' % ( + user_target, duration, service_name) + human.execute(self.destructor, command) + + getattr(nodes, action).assert_called_once_with( + target=cmd_target, duration=duration) + 
@ddt.data(('Disconnect', 'disconnect'), ('Connect', 'connect')) @ddt.unpack