From 9516ae6ffafa5107ff7e6639c2a2303dc6bc9ae4 Mon Sep 17 00:00:00 2001
From: Christian Breunig <christian@breunig.cc>
Date: Fri, 19 Jul 2024 13:54:26 +0200
Subject: wireless: T6597: improve hostapd startup and corresponding smoketests

This was found during smoketesting as thoase started to repeadingly fail in the last weeks

  File "/usr/libexec/vyos/tests/smoke/cli/test_interfaces_wireless.py", line 534, in test_wireless_security_station_address
      self.assertTrue(process_named_running('hostapd'))
  AssertionError: None is not true

Digging into this revealed that this is NOT related to the smoketest coding but
to hostapd/systemd instead. With a configured WIFI interface and calling:
"sudo systemctl reload-or-restart hostapd@wlan1" multiple times in a short
period caused systemd to report:
"Jul 18 16:15:32 systemd[1]: hostapd@wlan1.service: Deactivated successfully."

According to the internal systemd logic used in our version this is explained by:

  /* If there's a stop job queued before we enter the DEAD state, we shouldn't act on Restart=, in order to not
   * undo what has already been enqueued. */
  if (unit_stop_pending(UNIT(s)))
          allow_restart = false;

  if (s->result == SERVICE_SUCCESS)
          s->result = f;

  if (s->result == SERVICE_SUCCESS) {
          unit_log_success(UNIT(s));
          end_state = SERVICE_DEAD;`

Where unit_log_success() generates the log message in question.

Improve the restart login in the wireless interface script and an upgrade to
hostapd solved the issue.

(cherry picked from commit a67f49d99eda00998c425f9a663e138dbd0f7755)
---
 src/conf_mode/interfaces_wireless.py | 75 +++++++++++++++++++++++-------------
 1 file changed, 48 insertions(+), 27 deletions(-)

(limited to 'src')

diff --git a/src/conf_mode/interfaces_wireless.py b/src/conf_mode/interfaces_wireless.py
index 3f01612a2..ff38c979c 100755
--- a/src/conf_mode/interfaces_wireless.py
+++ b/src/conf_mode/interfaces_wireless.py
@@ -19,6 +19,7 @@ import os
 from sys import exit
 from re import findall
 from netaddr import EUI, mac_unix_expanded
+from time import sleep
 
 from vyos.config import Config
 from vyos.configdict import get_interface_dict
@@ -34,6 +35,9 @@ from vyos.template import render
 from vyos.utils.dict import dict_search
 from vyos.utils.kernel import check_kmod
 from vyos.utils.process import call
+from vyos.utils.process import is_systemd_service_active
+from vyos.utils.process import is_systemd_service_running
+from vyos.utils.network import interface_exists
 from vyos import ConfigError
 from vyos import airbag
 airbag.enable()
@@ -87,6 +91,11 @@ def get_config(config=None):
         if wifi.from_defaults(['security', 'wpa']): # if not set by user
             del wifi['security']['wpa']
 
+    # XXX: Jinja2 can not operate on a dictionary key when it starts of with a number
+    if '40mhz_incapable' in (dict_search('capabilities.ht', wifi) or []):
+        wifi['capabilities']['ht']['fourtymhz_incapable'] = wifi['capabilities']['ht']['40mhz_incapable']
+        del wifi['capabilities']['ht']['40mhz_incapable']
+
     if dict_search('security.wpa', wifi) != None:
         wpa_cipher = wifi['security']['wpa'].get('cipher')
         wpa_mode = wifi['security']['wpa'].get('mode')
@@ -114,7 +123,7 @@ def get_config(config=None):
     tmp = find_other_stations(conf, base, wifi['ifname'])
     if tmp: wifi['station_interfaces'] = tmp
 
-    # used in hostapt.conf.j2
+    # used in hostapd.conf.j2
     wifi['hostapd_accept_station_conf'] = hostapd_accept_station_conf.format(**wifi)
     wifi['hostapd_deny_station_conf'] = hostapd_deny_station_conf.format(**wifi)
 
@@ -218,11 +227,6 @@ def verify(wifi):
 def generate(wifi):
     interface = wifi['ifname']
 
-    # always stop hostapd service first before reconfiguring it
-    call(f'systemctl stop hostapd@{interface}.service')
-    # always stop wpa_supplicant service first before reconfiguring it
-    call(f'systemctl stop wpa_supplicant@{interface}.service')
-
     # Delete config files if interface is removed
     if 'deleted' in wifi:
         if os.path.isfile(hostapd_conf.format(**wifi)):
@@ -258,11 +262,6 @@ def generate(wifi):
             mac.dialect = mac_unix_expanded
             wifi['mac'] = str(mac)
 
-    # XXX: Jinja2 can not operate on a dictionary key when it starts of with a number
-    if '40mhz_incapable' in (dict_search('capabilities.ht', wifi) or []):
-        wifi['capabilities']['ht']['fourtymhz_incapable'] = wifi['capabilities']['ht']['40mhz_incapable']
-        del wifi['capabilities']['ht']['40mhz_incapable']
-
     # render appropriate new config files depending on access-point or station mode
     if wifi['type'] == 'access-point':
         render(hostapd_conf.format(**wifi), 'wifi/hostapd.conf.j2', wifi)
@@ -276,23 +275,45 @@ def generate(wifi):
 
 def apply(wifi):
     interface = wifi['ifname']
+    # From systemd source code:
+    # If there's a stop job queued before we enter the DEAD state, we shouldn't act on Restart=,
+    # in order to not undo what has already been enqueued. */
+    #
+    # It was found that calling restart on hostapd will (4 out of 10 cases) deactivate
+    # the service instead of restarting it, when it was not yet properly stopped
+    # systemd[1]: hostapd@wlan1.service: Deactivated successfully.
+    # Thus kill all WIFI service and start them again after it's ensured nothing lives
+    call(f'systemctl stop hostapd@{interface}.service')
+    call(f'systemctl stop wpa_supplicant@{interface}.service')
+
     if 'deleted' in wifi:
-        WiFiIf(interface).remove()
-    else:
-        # Finally create the new interface
-        w = WiFiIf(**wifi)
-        w.update(wifi)
-
-        # Enable/Disable interface - interface is always placed in
-        # administrative down state in WiFiIf class
-        if 'disable' not in wifi:
-            # Physical interface is now configured. Proceed by starting hostapd or
-            # wpa_supplicant daemon. When type is monitor we can just skip this.
-            if wifi['type'] == 'access-point':
-                call(f'systemctl start hostapd@{interface}.service')
-
-            elif wifi['type'] == 'station':
-                call(f'systemctl start wpa_supplicant@{interface}.service')
+        WiFiIf(**wifi).remove()
+        return None
+
+    while (is_systemd_service_running(f'hostapd@{interface}.service') or \
+           is_systemd_service_active(f'hostapd@{interface}.service')):
+        sleep(0.250) # wait 250ms
+
+    # Finally create the new interface
+    w = WiFiIf(**wifi)
+    w.update(wifi)
+
+    # Enable/Disable interface - interface is always placed in
+    # administrative down state in WiFiIf class
+    if 'disable' not in wifi:
+        # Wait until interface was properly added to the Kernel
+        ii = 0
+        while not (interface_exists(interface) and ii < 20):
+            sleep(0.250) # wait 250ms
+            ii += 1
+
+        # Physical interface is now configured. Proceed by starting hostapd or
+        # wpa_supplicant daemon. When type is monitor we can just skip this.
+        if wifi['type'] == 'access-point':
+            call(f'systemctl start hostapd@{interface}.service')
+
+        elif wifi['type'] == 'station':
+            call(f'systemctl start wpa_supplicant@{interface}.service')
 
     return None
 
-- 
cgit v1.2.3