
Exception caused by unsynchronized clocks

Exception message:

This scheduler instance () is still active but was recovered by another instance in the cluster
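
Quartz logs this warning from its clustered JDBC job store. For context, a clustered scheduler is typically configured along the following lines; the property keys are standard Quartz settings, the instance name "zl" matches the SCHED_NAME used in the SQL further below, and the data source name quartzDS is purely illustrative:

import java.util.Properties;

import org.quartz.Scheduler;
import org.quartz.impl.StdSchedulerFactory;

public class ClusteredSchedulerConfig {
    public static Scheduler buildScheduler() throws Exception {
        Properties props = new Properties();
        // Becomes the SCHED_NAME column in the QRTZ_* tables
        props.setProperty("org.quartz.scheduler.instanceName", "zl");
        // AUTO generates a unique instance id per node
        props.setProperty("org.quartz.scheduler.instanceId", "AUTO");
        props.setProperty("org.quartz.jobStore.class", "org.quartz.impl.jdbcjobstore.JobStoreTX");
        props.setProperty("org.quartz.jobStore.isClustered", "true");
        // Interval (ms) at which each node refreshes its check-in timestamp
        props.setProperty("org.quartz.jobStore.clusterCheckinInterval", "15000");
        props.setProperty("org.quartz.jobStore.dataSource", "quartzDS"); // illustrative name
        return new StdSchedulerFactory(props).getScheduler();
    }
}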

Corresponding source code:

protected boolean doCheckin() throws JobPersistenceException {
    boolean transOwner = false;
    boolean transStateOwner = false;
    boolean recovered = false;

    Connection conn = getNonManagedTXConnection();
    try {
        // Other than the first time, always checkin first to make sure there is
        // work to be done before we acquire the lock (since that is expensive,
        // and is almost never necessary). This must be done in a separate
        // transaction to prevent a deadlock under recovery conditions.
        List<SchedulerStateRecord> failedRecords = null;
        if (!firstCheckIn) {
            failedRecords = clusterCheckIn(conn);
            commitConnection(conn);
        }

        if (firstCheckIn || (failedRecords.size() > 0)) {
            getLockHandler().obtainLock(conn, LOCK_STATE_ACCESS);
            transStateOwner = true;

            // Now that we own the lock, make sure we still have work to do.
            // The first time through, we also need to make sure we update/create our state record
            failedRecords = (firstCheckIn) ? clusterCheckIn(conn) : findFailedInstances(conn);

            if (failedRecords.size() > 0) {
                getLockHandler().obtainLock(conn, LOCK_TRIGGER_ACCESS);
                //getLockHandler().obtainLock(conn, LOCK_JOB_ACCESS);
                transOwner = true;

                clusterRecover(conn, failedRecords);
                recovered = true;
            }
        }

        commitConnection(conn);
    } catch (JobPersistenceException e) {
        rollbackConnection(conn);
        throw e;
    } finally {
        try {
            releaseLock(LOCK_TRIGGER_ACCESS, transOwner);
        } finally {
            try {
                releaseLock(LOCK_STATE_ACCESS, transStateOwner);
            } finally {
                cleanupConnection(conn);
            }
        }
    }

    firstCheckIn = false;

    return recovered;
}
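
Except for the very first run, doCheckin first writes this node's heartbeat (clusterCheckIn) and commits it; only if any peer row then looks failed does it take the STATE_ACCESS and TRIGGER_ACCESS locks and run clusterRecover. The heartbeat boils down to refreshing this instance's row in QRTZ_SCHEDULER_STATE with the node's local clock, roughly like the sketch below (not the actual Quartz source, and assuming the standard table layout):

import java.sql.Connection;
import java.sql.PreparedStatement;

public class CheckinSketch {
    // Rough sketch of the heartbeat that clusterCheckIn performs: the row keyed by
    // (SCHED_NAME, INSTANCE_NAME) gets a fresh LAST_CHECKIN_TIME taken from this
    // node's local clock -- which is exactly why unsynchronized clocks matter.
    static void checkin(Connection conn, String schedName, String instanceId) throws Exception {
        String sql = "UPDATE QRTZ_SCHEDULER_STATE SET LAST_CHECKIN_TIME = ? "
                   + "WHERE SCHED_NAME = ? AND INSTANCE_NAME = ?";
        try (PreparedStatement ps = conn.prepareStatement(sql)) {
            ps.setLong(1, System.currentTimeMillis()); // this node's local clock
            ps.setString(2, schedName);
            ps.setString(3, instanceId);
            ps.executeUpdate();
        }
    }
}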
protected List<SchedulerStateRecord> findFailedInstances(Connection conn)
    throws JobPersistenceException {
    try {
        List<SchedulerStateRecord> failedInstances = new LinkedList<SchedulerStateRecord>();
        boolean foundThisScheduler = false;
        long timeNow = System.currentTimeMillis();

        // Fetch all records from the QRTZ_SCHEDULER_STATE table. The SQL is:
        // SELECT * FROM QRTZ_SCHEDULER_STATE WHERE SCHED_NAME = 'zl',
        // where SCHED_NAME is the org.quartz.scheduler.instanceName value from the configuration file.
        List<SchedulerStateRecord> states = getDelegate().selectSchedulerStateRecords(conn, null);

        for(SchedulerStateRecord rec: states) {

            // find own record...
            if (rec.getSchedulerInstanceId().equals(getInstanceId())) {
                foundThisScheduler = true;
                if (firstCheckIn) {
                    failedInstances.add(rec);
                }
            } else {
                // find failed instances...
                if (calcFailedIfAfter(rec) < timeNow) {
                    failedInstances.add(rec);
                }
            }
        }

        // The first time through, also check for orphaned fired triggers.
        if (firstCheckIn) {
            failedInstances.addAll(findOrphanedFailedInstances(conn, states));
        }

        // If not the first time but we didn't find our own instance, then
        // Someone must have done recovery for us.
        // !foundThisScheduler means this application did not find its own instance record
        // !firstCheckIn means this is not the application's first check-in
        if ((!foundThisScheduler) && (!firstCheckIn)) {
            // FUTURE_TODO: revisit when handle self-failed-out impl'ed (see FUTURE_TODO in clusterCheckIn() below)
            getLog().warn(
                "This scheduler instance (" + getInstanceId() + ") is still " +
                "active but was recovered by another instance in the cluster. " +
                "This may cause inconsistent behavior.");
        }

        return failedInstances;
    } catch (Exception e) {
        lastCheckin = System.currentTimeMillis();
        throw new JobPersistenceException("Failure identifying failed instances when checking-in: "
            + e.getMessage(), e);
    }
}
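
Putting the two methods together explains the warning. Each node judges its peers with its own System.currentTimeMillis(), via calcFailedIfAfter(rec) < timeNow. If node B's clock runs far enough ahead of node A's, then A's last check-in (written with A's clock) already looks expired to B, so B marks A as failed, recovers its triggers and removes its state row; on A's next check-in, findFailedInstances no longer finds A's own record, and A logs "still active but was recovered by another instance in the cluster". A minimal sketch of that arithmetic, assuming a 15-second check-in interval and a 7.5-second grace period (the exact grace value inside Quartz may differ):

public class ClockSkewDemo {
    static final long CHECKIN_INTERVAL_MS = 15_000; // clusterCheckinInterval
    static final long GRACE_MS = 7_500;             // illustrative grace period

    // Mirrors the idea behind calcFailedIfAfter (not the Quartz source): a node is
    // treated as failed once "now" on the observing node passes the other node's
    // last check-in time plus its check-in interval plus the grace period.
    static boolean looksFailed(long lastCheckinOnDb, long nowOnObserver) {
        long failedIfAfter = lastCheckinOnDb + CHECKIN_INTERVAL_MS + GRACE_MS;
        return failedIfAfter < nowOnObserver;
    }

    public static void main(String[] args) {
        long nodeAClock = 1_000_000L;           // node A checked in an instant ago, by its own clock
        long nodeBClock = nodeAClock + 60_000L; // node B's clock runs 60 s ahead

        // Node B already considers A dead and will recover it, even though A is alive.
        System.out.println(looksFailed(nodeAClock, nodeBClock)); // true
    }
}

The fix is therefore not in Quartz itself: keep the cluster nodes' clocks synchronized (for example via NTP), and keep any clock drift well below clusterCheckinInterval plus the grace period.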