@@ -969,6 +969,13 @@ public void reallyRun() {
969969 }
970970
971971 logger .trace ("End cleanup expired async-jobs" );
972+
973+ // 3) Cleanup orphaned networks stuck in Implementing state without async jobs
974+ try {
975+ cleanupOrphanedNetworks ();
976+ } catch (Throwable e ) {
977+ logger .error ("Unexpected exception when trying to cleanup orphaned networks" , e );
978+ }
972979 } catch (Throwable e ) {
973980 logger .error ("Unexpected exception when trying to execute queue item, " , e );
974981 }
@@ -1284,6 +1291,74 @@ private void cleanupFailedSnapshotsCreatedWithDefaultStrategy(final long msid) {
12841291 }
12851292 }
12861293
1294+ /**
1295+ * Cleanup networks that are stuck in Implementing state without associated async jobs.
1296+ * This only processes networks that have been stuck for longer than the job expiration threshold.
1297+ */
1298+ private void cleanupOrphanedNetworks () {
1299+ try {
1300+ SearchCriteria <NetworkVO > sc = networkDao .createSearchCriteria ();
1301+ sc .addAnd ("state" , SearchCriteria .Op .EQ , Network .State .Implementing );
1302+ sc .addAnd ("removed" , SearchCriteria .Op .NULL );
1303+ List <NetworkVO > implementingNetworks = networkDao .search (sc , null );
1304+
1305+ if (implementingNetworks == null || implementingNetworks .isEmpty ()) {
1306+ return ;
1307+ }
1308+
1309+ logger .debug ("Found {} networks in Implementing state, checking for orphaned networks" , implementingNetworks .size ());
1310+
1311+ final long expireMinutes = JobExpireMinutes .value ();
1312+ final Date cutoffTime = new Date (System .currentTimeMillis () - (expireMinutes * 60 * 1000 ));
1313+
1314+ for (NetworkVO network : implementingNetworks ) {
1315+ if (network .getCreated ().after (cutoffTime )) {
1316+ logger .trace ("Network {} in Implementing state is only {} minutes old (threshold: {} minutes), skipping cleanup" ,
1317+ network .getId (),
1318+ (System .currentTimeMillis () - network .getCreated ().getTime ()) / 60000 ,
1319+ expireMinutes );
1320+ continue ;
1321+ }
1322+
1323+ List <AsyncJobVO > jobs = _jobDao .findInstancePendingAsyncJobs ("Network" , network .getAccountId ());
1324+ boolean hasActiveJob = false ;
1325+ for (AsyncJobVO job : jobs ) {
1326+ if (job .getInstanceId () != null && job .getInstanceId ().equals (network .getId ())) {
1327+ hasActiveJob = true ;
1328+ break ;
1329+ }
1330+ }
1331+
1332+ if (hasActiveJob ) {
1333+ logger .debug ("Network {} in Implementing state has active async job, skipping cleanup" , network .getId ());
1334+ continue ;
1335+ }
1336+
1337+ logger .warn ("Found orphaned network {} in Implementing state without async job. " +
1338+ "Network created: {}, age: {} minutes, expiration threshold: {} minutes. Transitioning to Shutdown state." ,
1339+ network .getId (), network .getCreated (),
1340+ (System .currentTimeMillis () - network .getCreated ().getTime ()) / 60000 ,
1341+ expireMinutes );
1342+ updateNetworkState (network );
1343+
1344+ }
1345+ } catch (Exception e ) {
1346+ logger .error ("Error while cleaning up orphaned networks" , e );
1347+ }
1348+ }
1349+
1350+ private void updateNetworkState (NetworkVO network ) {
1351+ try {
1352+ networkOrchestrationService .stateTransitTo (network , Network .Event .OperationFailed );
1353+ logger .info ("Successfully transitioned orphaned network {} to Shutdown state using state machine" , network .getId ());
1354+ } catch (final NoTransitionException e ) {
1355+ logger .debug ("State transition failed for orphaned network {}, forcing state update" , network .getId ());
1356+ network .setState (Network .State .Shutdown );
1357+ networkDao .update (network .getId (), network );
1358+ logger .info ("Successfully forced orphaned network {} to Shutdown state" , network .getId ());
1359+ }
1360+ }
1361+
12871362 @ Override
12881363 public void onManagementNodeJoined (List <? extends ManagementServerHost > nodeList , long selfNodeId ) {
12891364 }
0 commit comments