Skip to content

Commit d648e8a

Browse files
Enhance K8s extn for node failure cases (#2033)
Signed-off-by: Yunus Qureshi <[email protected]>
1 parent 2785bee commit d648e8a

File tree

2 files changed

+62
-16
lines changed

2 files changed

+62
-16
lines changed

OracleDatabase/SingleInstance/extensions/k8s/lock.py

Lines changed: 59 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -18,42 +18,84 @@
1818
import argparse
1919
import fcntl
2020
import tempfile
21+
import threading, subprocess
2122
from multiprocessing.connection import Listener, Client
2223

2324
# Multiprocess communication auth key
2425
AUTHKEY = 'vkidSQkgAHc='
26+
DIR_LOCK_FILE = os.sep + '.dirlock'
2527

26-
27-
def acquire_lock(lock_file, sock_file, block):
28+
def acquire_lock(lock_file, sock_file, block, heartbeat):
2829
"""
2930
Acquire a lock on the passed file, block if needed
3031
:param lock_file:
3132
:param sock_file:
3233
:param block:
3334
:return:
3435
"""
35-
print('[%s]: Acquiring lock on %s' % (time.strftime('%Y:%m:%d %H:%M:%S'), lock_file))
36-
lock_handle = open(lock_file, 'w')
36+
37+
# get dir lock first to check lock file existence
38+
with open(os.path.dirname(lock_file) + DIR_LOCK_FILE, 'w') as dir_lh:
39+
fcntl.flock(dir_lh, fcntl.LOCK_EX)
40+
if not os.path.exists(lock_file):
41+
print('[%s]: Creating %s' % (time.strftime('%Y:%m:%d %H:%M:%S'), os.path.basename(lock_file)))
42+
open(lock_file, 'w').close()
43+
44+
lock_handle = open(lock_file)
45+
print('[%s]: Acquiring lock %s with heartbeat %s secs' %
46+
(time.strftime('%Y:%m:%d %H:%M:%S'), os.path.basename(lock_file), heartbeat))
3747
while True:
3848
try:
3949
fcntl.flock(lock_handle, fcntl.LOCK_EX | fcntl.LOCK_NB)
40-
print('[%s]: Lock acquired on %s' % (time.strftime('%Y:%m:%d %H:%M:%S'), lock_file))
50+
print('[%s]: Lock acquired' % (time.strftime('%Y:%m:%d %H:%M:%S')))
51+
with open(os.path.dirname(lock_file) + DIR_LOCK_FILE, 'w') as dir_lh:
52+
fcntl.flock(dir_lh, fcntl.LOCK_EX)
53+
print('[%s]: Starting heartbeat' % (time.strftime('%Y:%m:%d %H:%M:%S')))
54+
os.utime(lock_file, None)
4155
break
4256
except IOError as e:
4357
if not block:
4458
print(e)
4559
return 1
60+
4661
time.sleep(0.1)
4762

63+
# to handle stale NFS locks
64+
pulse = int(time.time() - os.path.getmtime(lock_file))
65+
if heartbeat < pulse:
66+
# something is wrong
67+
print('[%s]: Lost heartbeat by %s secs' % (time.strftime('%Y:%m:%d %H:%M:%S'), pulse))
68+
lock_handle.close()
69+
# get dir lock
70+
with open(os.path.dirname(lock_file) + DIR_LOCK_FILE, 'w') as dir_lh:
71+
fcntl.flock(dir_lh, fcntl.LOCK_EX)
72+
# pulse check again after acquiring dir lock
73+
if heartbeat < int(time.time() - os.path.getmtime(lock_file)):
74+
print('[%s]: Recreating %s' % (time.strftime('%Y:%m:%d %H:%M:%S'), os.path.basename(lock_file)))
75+
os.remove(lock_file)
76+
open(lock_file, 'w').close()
77+
78+
lock_handle = open(lock_file)
79+
print('[%s]: Reacquiring lock %s' %
80+
(time.strftime('%Y:%m:%d %H:%M:%S'), os.path.basename(lock_file)))
81+
82+
4883
if os.fork():
4984
return 0
5085
else:
5186
# Spawn a child process to hold on to the lock
5287
if os.path.exists(sock_file):
5388
os.remove(sock_file)
54-
print('[%s]: Holding on to the lock using %s' % (time.strftime('%Y:%m:%d %H:%M:%S'), sock_file))
89+
print('[%s]: Lock held %s' % (time.strftime('%Y:%m:%d %H:%M:%S'), os.path.basename(lock_file)))
5590
listener = Listener(address=sock_file, authkey=AUTHKEY)
5691

92+
def listen():
93+
while True:
94+
conn = listener.accept()
95+
if conn.recv():
96+
break
97+
release()
98+
5799
def release(sig=None, frame=None):
58100
"""
59101
Release if the process is stopped/terminated
@@ -67,15 +109,15 @@ def release(sig=None, frame=None):
67109
time.sleep(30)
68110
lock_handle.close()
69111
listener.close()
70-
print('[%s]: Lock released on %s' % (time.strftime('%Y:%m:%d %H:%M:%S'), lock_file))
112+
print('[%s]: Lock released %s' % (time.strftime('%Y:%m:%d %H:%M:%S'), os.path.basename(lock_file)))
71113

72114
signal.signal(signal.SIGTERM, release)
73115
signal.signal(signal.SIGINT, release)
74-
while True:
75-
conn = listener.accept()
76-
if conn.recv():
77-
break
78-
release()
116+
threading.Thread(target=listen).start()
117+
118+
while not lock_handle.closed:
119+
os.utime(lock_file, None)
120+
time.sleep(5)
79121

80122

81123
def check_lock(sock_file):
@@ -90,7 +132,7 @@ def check_lock(sock_file):
90132
cl = Client(address=sock_file, authkey=AUTHKEY)
91133
cl.send(False)
92134
cl.close()
93-
print('[%s]: Lock held' % (time.strftime('%Y:%m:%d %H:%M:%S')))
135+
print('[%s]: Lock held %s' % (time.strftime('%Y:%m:%d %H:%M:%S'), os.path.basename(sock_file)))
94136
return 0
95137

96138

@@ -102,7 +144,7 @@ def release_lock(sock_file):
102144
"""
103145
if not os.path.exists(sock_file):
104146
return 1
105-
print('[%s]: Connecting to the lock process %s' % (time.strftime('%Y:%m:%d %H:%M:%S'), sock_file))
147+
print('[%s]: Releasing lock %s' % (time.strftime('%Y:%m:%d %H:%M:%S'), os.path.basename(sock_file)))
106148
cl = Client(address=sock_file, authkey=AUTHKEY)
107149
cl.send(True)
108150
cl.close()
@@ -120,14 +162,16 @@ def main():
120162
parser.add_argument('--release', action='store_true', dest='release')
121163
parser.add_argument('--file', dest='lock_file')
122164
parser.add_argument('--block', action='store_true', dest='block')
165+
# heartbeat in secs
166+
parser.add_argument('--heartbeat', type=int, dest='heartbeat', default=30)
123167
args = parser.parse_args()
124168
if not args.lock_file:
125169
parser.print_help()
126170
sys.exit()
127171
# Derive sock_file name from lock_file
128172
sock_file = os.path.join(tempfile.gettempdir(), os.path.basename(args.lock_file))
129173
if args.acquire:
130-
sys.exit(acquire_lock(args.lock_file, sock_file, args.block))
174+
sys.exit(acquire_lock(args.lock_file, sock_file, args.block, args.heartbeat))
131175
elif args.check:
132176
sys.exit(check_lock(sock_file))
133177
elif args.release:

OracleDatabase/SingleInstance/extensions/k8s/startDB.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@ fi;
2020

2121
export ORACLE_SID=$(grep "$ORACLE_HOME" /etc/oratab | cut -d: -f1)
2222

23-
# Start database in nomount mode, shutdown first to abort any zombie procs on restart
23+
# Clean up any leftover shared memory segments and semaphores from a container crash, then start database in nomount mode
24+
ipcs -m | awk ' /[0-9]/ {print $2}' | xargs -n1 ipcrm -m 2> /dev/null
25+
ipcs -s | awk ' /[0-9]/ {print $2}' | xargs -n1 ipcrm -s 2> /dev/null
2426
for i in {1..10}; do
2527
sqlplus / as sysdba << EOF
2628
shutdown abort;

0 commit comments

Comments
 (0)