@@ -165,11 +165,8 @@ struct afs_status_cb {
* AFS volume synchronisation information
*/
struct afs_volsync {
- unsigned int mask; /* Bitmask of supplied fields */
-#define AFS_VOLSYNC_CREATION 0x01
-#define AFS_VOLSYNC_UPDATE 0x02
- time64_t creation; /* volume creation time */
- time64_t update; /* Volume update time */
+ time64_t creation; /* Volume creation time (or TIME64_MIN) */
+ time64_t update; /* Volume update time (or TIME64_MIN) */
};
/*
@@ -807,8 +807,8 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
cookie->fids[i].vid = dvnode->fid.vid;
cookie->ctx.actor = afs_lookup_filldir;
cookie->name = dentry->d_name;
- cookie->nr_fids = 2; /* slot 0 is saved for the fid we actually want
- * and slot 1 for the directory */
+ cookie->nr_fids = 2; /* slot 1 is saved for the fid we actually want
+ * and slot 0 for the directory */
if (!afs_server_supports_ibulk(dvnode))
cookie->one_only = true;
@@ -39,9 +39,8 @@ struct afs_operation *afs_alloc_operation(struct key *key, struct afs_volume *vo
op->volume = afs_get_volume(volume, afs_volume_trace_get_new_op);
op->net = volume->cell->net;
op->cb_v_break = atomic_read(&volume->cb_v_break);
- op->pre_volsync.mask = READ_ONCE(volume->volsync_mask);
- op->pre_volsync.creation = atomic64_read(&volume->creation_time);
- op->pre_volsync.update = atomic64_read(&volume->update_time);
+ op->pre_volsync.creation = volume->creation_time;
+ op->pre_volsync.update = volume->update_time;
op->debug_id = atomic_inc_return(&afs_operation_debug_counter);
op->nr_iterations = -1;
afs_op_set_error(op, -EDESTADDRREQ);
@@ -165,10 +165,8 @@ static void xdr_decode_AFSVolSync(const __be32 **_bp,
bp++; /* spare6 */
*_bp = bp;
- if (volsync) {
+ if (volsync)
volsync->creation = creation;
- volsync->mask |= AFS_VOLSYNC_CREATION;
- }
}
/*
@@ -212,7 +212,8 @@ static void afs_apply_status(struct afs_operation *op,
vnode->status = *status;
if (vp->dv_before + vp->dv_delta != status->data_version) {
- if (vnode->cb_expires_at != AFS_NO_CB_PROMISE)
+ if (vnode->cb_ro_snapshot == atomic_read(&vnode->volume->cb_ro_snapshot) &&
+ vnode->cb_expires_at != AFS_NO_CB_PROMISE)
pr_warn("kAFS: vnode modified {%llx:%llu} %llx->%llx %s (op=%x)\n",
vnode->fid.vid, vnode->fid.vnode,
(unsigned long long)vp->dv_before + vp->dv_delta,
@@ -327,8 +328,6 @@ static void afs_fetch_status_success(struct afs_operation *op)
struct afs_vnode *vnode = vp->vnode;
int ret;
- afs_update_volume_state(op, vp);
-
if (vnode->netfs.inode.i_state & I_NEW) {
ret = afs_inode_init_from_status(op, vp, vnode);
afs_op_set_error(op, ret);
@@ -512,6 +512,7 @@ struct afs_vldb_entry {
#define AFS_VOL_VTM_RW 0x01 /* R/W version of the volume is available (on this server) */
#define AFS_VOL_VTM_RO 0x02 /* R/O version of the volume is available (on this server) */
#define AFS_VOL_VTM_BAK 0x04 /* backup version of the volume is available (on this server) */
+ u8 vlsf_flags[AFS_NMAXNSERVERS];
short error;
u8 nr_servers; /* Number of server records */
u8 name_len;
@@ -595,6 +596,12 @@ struct afs_server {
spinlock_t probe_lock;
};
+enum afs_ro_replicating {
+ AFS_RO_NOT_REPLICATING, /* Not doing replication */
+ AFS_RO_REPLICATING_USE_OLD, /* Replicating; use old version */
+ AFS_RO_REPLICATING_USE_NEW, /* Replicating; switch to new version */
+} __mode(byte);
+
/*
* Replaceable volume server list.
*/
@@ -606,12 +613,14 @@ struct afs_server_entry {
unsigned long flags;
#define AFS_SE_VOLUME_OFFLINE 0 /* Set if volume offline notice given */
#define AFS_SE_VOLUME_BUSY 1 /* Set if volume busy notice given */
+#define AFS_SE_EXCLUDED 2 /* Set if server is to be excluded in rotation */
};
struct afs_server_list {
struct rcu_head rcu;
refcount_t usage;
bool attached; /* T if attached to servers */
+ enum afs_ro_replicating ro_replicating; /* RW->RO update (probably) in progress */
unsigned char nr_servers;
unsigned short vnovol_mask; /* Servers to be skipped due to VNOVOL */
unsigned int seq; /* Set to ->servers_seq when installed */
@@ -647,9 +656,9 @@ struct afs_volume {
unsigned int servers_seq; /* Incremented each time ->servers changes */
/* RO release tracking */
- unsigned int volsync_mask; /* Mask of what values we have obtained */
- atomic64_t creation_time; /* Volume creation time (time64_t) */
- atomic64_t update_time; /* Volume update time (time64_t) */
+ struct mutex volsync_lock; /* Time/state evaluation lock */
+ time64_t creation_time; /* Volume creation time (or TIME64_MIN) */
+ time64_t update_time; /* Volume update time (or TIME64_MIN) */
/* Callback management */
struct mutex cb_check_lock; /* Lock to control race to check after v_break */
@@ -1571,7 +1580,7 @@ extern void afs_fs_exit(void);
* validation.c
*/
bool afs_check_validity(const struct afs_vnode *vnode);
-void afs_update_volume_state(struct afs_operation *op, struct afs_vnode_param *vp);
+int afs_update_volume_state(struct afs_operation *op);
int afs_validate(struct afs_vnode *vnode, struct key *key);
/*
@@ -187,6 +187,22 @@ bool afs_select_fileserver(struct afs_operation *op)
clear_bit(AFS_SE_VOLUME_BUSY,
&op->server_list->servers[op->server_index].flags);
op->cumul_error.responded = true;
+
+ /* We succeeded, but we may need to check the VLDB and redo the
+ * op from another server if we're looking at a set of RO
+ * volumes where some of the servers have not yet been brought
+ * up to date to avoid regressing the data. We only switch to
+ * the new version once >=50% of the servers are updated.
+ */
+ error = afs_update_volume_state(op);
+ if (error != 0) {
+ if (error == 1) {
+ afs_sleep_and_retry(op);
+ goto restart_from_beginning;
+ }
+ afs_op_set_error(op, error);
+ goto failed;
+ }
fallthrough;
default:
/* Success or local failure. Stop. */
@@ -517,10 +533,12 @@ bool afs_select_fileserver(struct afs_operation *op)
best_prio = -1;
for (i = 0; i < op->server_list->nr_servers; i++) {
struct afs_endpoint_state *es;
+ struct afs_server_entry *se = &op->server_list->servers[i];
struct afs_addr_list *sal;
- struct afs_server *s = op->server_list->servers[i].server;
+ struct afs_server *s = se->server;
if (!test_bit(i, &op->untried_servers) ||
+ test_bit(AFS_SE_EXCLUDED, &se->flags) ||
!test_bit(AFS_SERVER_FL_RESPONDING, &s->flags))
continue;
es = op->server_states->endpoint_state;
@@ -598,6 +616,8 @@ bool afs_select_fileserver(struct afs_operation *op)
op->addr_index = addr_index;
set_bit(addr_index, &op->addr_tried);
+ op->volsync.creation = TIME64_MIN;
+ op->volsync.update = TIME64_MIN;
op->call_responded = false;
_debug("address [%u] %u/%u %pISp",
op->server_index, addr_index, alist->nr_addrs,
@@ -31,23 +31,54 @@ struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume,
struct afs_server_list *slist;
struct afs_server *server;
unsigned int type_mask = 1 << volume->type;
- int ret = -ENOMEM, nr_servers = 0, i, j;
-
- for (i = 0; i < vldb->nr_servers; i++)
- if (vldb->fs_mask[i] & type_mask)
- nr_servers++;
+ bool use_newrepsites = false;
+ int ret = -ENOMEM, nr_servers = 0, newrep = 0, i, j;
+
+ /* Work out if we're going to restrict to NEWREPSITE-marked servers or
+ * not. If at least one site is marked as NEWREPSITE, then it's likely
+ * that "vos release" is busy updating RO sites. We cut over from one
+ * to the other when >=50% of the sites have been updated. Sites that
+ * are in the process of being updated are marked DONTUSE.
+ */
+ for (i = 0; i < vldb->nr_servers; i++) {
+ if (!(vldb->fs_mask[i] & type_mask))
+ continue;
+ nr_servers++;
+ if (vldb->vlsf_flags[i] & AFS_VLSF_DONTUSE)
+ continue;
+ if (vldb->vlsf_flags[i] & AFS_VLSF_NEWREPSITE)
+ newrep++;
+ }
slist = kzalloc(struct_size(slist, servers, nr_servers), GFP_KERNEL);
if (!slist)
goto error;
+ if (newrep) {
+ if (newrep < nr_servers / 2) {
+ kdebug("USE-OLD");
+ slist->ro_replicating = AFS_RO_REPLICATING_USE_OLD;
+ } else {
+ kdebug("USE-NEW");
+ slist->ro_replicating = AFS_RO_REPLICATING_USE_NEW;
+ use_newrepsites = true;
+ }
+ }
+
refcount_set(&slist->usage, 1);
rwlock_init(&slist->lock);
/* Make sure a records exists for each server in the list. */
for (i = 0; i < vldb->nr_servers; i++) {
+ unsigned long se_flags = 0;
+ bool newrepsite = vldb->vlsf_flags[i] & AFS_VLSF_NEWREPSITE;
+
if (!(vldb->fs_mask[i] & type_mask))
continue;
+ if (vldb->vlsf_flags[i] & AFS_VLSF_DONTUSE)
+ __set_bit(AFS_SE_EXCLUDED, &se_flags);
+ if (newrep && (newrepsite ^ use_newrepsites))
+ __set_bit(AFS_SE_EXCLUDED, &se_flags);
server = afs_lookup_server(volume->cell, key, &vldb->fs_server[i],
vldb->addr_version[i]);
@@ -79,6 +111,7 @@ struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume,
slist->servers[j].server = server;
slist->servers[j].volume = volume;
+ slist->servers[j].flags = se_flags;
slist->servers[j].cb_expires_at = AFS_NO_CB_PROMISE;
slist->nr_servers++;
}
@@ -102,14 +135,19 @@ struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume,
bool afs_annotate_server_list(struct afs_server_list *new,
struct afs_server_list *old)
{
+ unsigned long mask = 1UL << AFS_SE_EXCLUDED;
int i;
- if (old->nr_servers != new->nr_servers)
+ if (old->nr_servers != new->nr_servers ||
+ old->ro_replicating != new->ro_replicating)
goto changed;
- for (i = 0; i < old->nr_servers; i++)
+ for (i = 0; i < old->nr_servers; i++) {
if (old->servers[i].server != new->servers[i].server)
goto changed;
+ if ((old->servers[i].flags & mask) != (new->servers[i].flags & mask))
+ goto changed;
+ }
return false;
changed:
return true;
@@ -135,49 +135,115 @@ bool afs_check_validity(const struct afs_vnode *vnode)
return true;
}
+/*
+ * See if the server we've just talked to is currently excluded.
+ */
+static bool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
+{
+ const struct afs_server_entry *se;
+ const struct afs_server_list *slist;
+ bool is_excluded = true;
+ int i;
+
+ rcu_read_lock();
+
+ slist = rcu_dereference(volume->servers);
+ for (i = 0; i < slist->nr_servers; i++) {
+ se = &slist->servers[i];
+ if (op->server == se->server) {
+ is_excluded = test_bit(AFS_SE_EXCLUDED, &se->flags);
+ break;
+ }
+ }
+
+ rcu_read_unlock();
+ return is_excluded;
+}
+
+/*
+ * Update the volume's server list when the creation time changes and see if
+ * the server we've just talked to is currently excluded.
+ */
+static int afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
+{
+ int ret;
+
+ if (__afs_is_server_excluded(op, volume))
+ return 1;
+
+ set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags);
+ ret = afs_check_volume_status(op->volume, op);
+ if (ret < 0)
+ return ret;
+
+ return __afs_is_server_excluded(op, volume);
+}
+
/*
* Handle a change to the volume creation time in the VolSync record.
*/
-static void afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume)
+static int afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume)
{
- enum afs_cb_break_reason reason = afs_cb_break_for_vos_release;
unsigned int snap;
- time64_t cur = atomic64_read(&volume->creation_time);
+ time64_t cur = volume->creation_time;
time64_t old = op->pre_volsync.creation;
time64_t new = op->volsync.creation;
+ int ret;
_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);
- if (!(op->volsync.mask & AFS_VOLSYNC_CREATION) ||
- !(volume->volsync_mask & AFS_VOLSYNC_CREATION)) {
- atomic64_set(&volume->creation_time, new);
- volume->volsync_mask |= AFS_VOLSYNC_CREATION;
- return;
+ if (cur == TIME64_MIN) {
+ volume->creation_time = new;
+ return 0;
}
- if (likely(new == cur))
- return;
+ if (new == cur)
+ return 0;
+
+ /* Try to advance the creation timestamp from what we had before the
+ * operation to what we got back from the server. This should
+ * hopefully ensure that in a race between multiple operations only one
+ * of them will do this.
+ */
+ if (cur != old)
+ return 0;
/* If the creation time changes in an unexpected way, we need to scrub
* our caches. For a RW vol, this will only change if the volume is
* restored from a backup; for a RO/Backup vol, this will advance when
* the volume is updated to a new snapshot (eg. "vos release").
*/
- if (volume->type == AFSVL_RWVOL || new < old)
- reason = afs_cb_break_for_creation_regress;
+ if (volume->type == AFSVL_RWVOL)
+ goto regressed;
+ if (volume->type == AFSVL_BACKVOL) {
+ if (new < old)
+ goto regressed;
+ goto advance;
+ }
- /* Try to advance the creation timestamp from what we had before the
- * operation to what we got back from the server. This should
- * hopefully ensure that in a race between multiple operations only one
- * of them will do this.
+ /* We have an RO volume, we need to query the VL server and look at the
+ * server flags to see if RW->RO replication is in progress.
*/
- if (atomic64_try_cmpxchg(&volume->creation_time, &old, new)) {
- if (reason == afs_cb_break_for_creation_regress)
- atomic_inc(&volume->cb_scrub);
- else if (volume->type != AFSVL_RWVOL)
- snap = atomic_inc_return(&volume->cb_ro_snapshot);
- trace_afs_cb_v_break(volume->vid, snap, reason);
+ ret = afs_is_server_excluded(op, volume);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ snap = atomic_read(&volume->cb_ro_snapshot);
+ trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_volume_excluded);
+ return ret;
}
+
+advance:
+ snap = atomic_inc_return(&volume->cb_ro_snapshot);
+ trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_for_vos_release);
+ volume->creation_time = new;
+ return 0;
+
+regressed:
+ atomic_inc(&volume->cb_scrub);
+ trace_afs_cb_v_break(volume->vid, 0, afs_cb_break_for_creation_regress);
+ volume->creation_time = new;
+ return 0;
}
/*
@@ -186,20 +252,18 @@ static void afs_update_volume_creation_time(struct afs_operation *op, struct afs
static void afs_update_volume_update_time(struct afs_operation *op, struct afs_volume *volume)
{
enum afs_cb_break_reason reason = afs_cb_break_no_break;
- time64_t cur = atomic64_read(&volume->update_time);
+ time64_t cur = volume->update_time;
time64_t old = op->pre_volsync.update;
time64_t new = op->volsync.update;
_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);
- if (!(op->volsync.mask & AFS_VOLSYNC_UPDATE) ||
- !(volume->volsync_mask & AFS_VOLSYNC_UPDATE)) {
- atomic64_set(&volume->update_time, new);
- volume->volsync_mask |= AFS_VOLSYNC_UPDATE;
+ if (cur == TIME64_MIN) {
+ volume->update_time = new;
return;
}
- if (likely(new == cur))
+ if (new == cur)
return;
/* If the volume update time changes in an unexpected way, we need to
@@ -215,33 +279,59 @@ static void afs_update_volume_update_time(struct afs_operation *op, struct afs_v
* hopefully ensure that in a race between multiple operations only one
* of them will do this.
*/
- if (atomic64_try_cmpxchg(&volume->update_time, &old, new)) {
+ if (cur == old) {
if (reason == afs_cb_break_for_update_regress) {
atomic_inc(&volume->cb_scrub);
trace_afs_cb_v_break(volume->vid, 0, reason);
}
+ volume->update_time = new;
+ }
+}
+
+static int afs_update_volume_times(struct afs_operation *op, struct afs_volume *volume)
+{
+ int ret = 0;
+
+ if (likely(op->volsync.creation == volume->creation_time &&
+ op->volsync.update == volume->update_time))
+ return 0;
+
+ mutex_lock(&volume->volsync_lock);
+ if (op->volsync.creation != volume->creation_time) {
+ ret = afs_update_volume_creation_time(op, volume);
+ if (ret < 0)
+ goto out;
}
+ if (op->volsync.update != volume->update_time)
+ afs_update_volume_update_time(op, volume);
+out:
+ mutex_unlock(&volume->volsync_lock);
+ return ret;
}
/*
* Update the state of a volume, including recording the expiration time of the
- * callback promise.
+ * callback promise. Returns 1 to redo the operation from the start.
*/
-void afs_update_volume_state(struct afs_operation *op, struct afs_vnode_param *vp)
+int afs_update_volume_state(struct afs_operation *op)
{
struct afs_server_list *slist = op->server_list;
struct afs_server_entry *se = &slist->servers[op->server_index];
- struct afs_callback *cb = &vp->scb.callback;
+ struct afs_callback *cb = &op->file[0].scb.callback;
struct afs_volume *volume = op->volume;
unsigned int cb_v_break = atomic_read(&volume->cb_v_break);
unsigned int cb_v_check = atomic_read(&volume->cb_v_check);
+ int ret = 0;
_enter("%llx", op->volume->vid);
- if (op->volsync.mask & AFS_VOLSYNC_CREATION)
- afs_update_volume_creation_time(op, volume);
- if (op->volsync.mask & AFS_VOLSYNC_UPDATE)
- afs_update_volume_update_time(op, volume);
+ if (op->volsync.creation != TIME64_MIN || op->volsync.update != TIME64_MIN) {
+ ret = afs_update_volume_times(op, volume);
+ if (ret != 0) {
+ _leave(" = %d", ret);
+ return ret;
+ }
+ }
if (op->cb_v_break == cb_v_break) {
se->cb_expires_at = cb->expires_at;
@@ -249,6 +339,8 @@ void afs_update_volume_state(struct afs_operation *op, struct afs_vnode_param *v
}
if (cb_v_check < op->cb_v_break)
atomic_cmpxchg(&volume->cb_v_check, cb_v_check, op->cb_v_break);
+ _leave(" = %d", ret);
+ return ret;
}
/*
@@ -18,8 +18,7 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call)
{
struct afs_uvldbentry__xdr *uvldb;
struct afs_vldb_entry *entry;
- bool new_only = false;
- u32 tmp, nr_servers, vlflags;
+ u32 nr_servers, vlflags;
int i, ret;
_enter("");
@@ -41,27 +40,14 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call)
entry->name[i] = 0;
entry->name_len = strlen(entry->name);
- /* If there is a new replication site that we can use, ignore all the
- * sites that aren't marked as new.
- */
- for (i = 0; i < nr_servers; i++) {
- tmp = ntohl(uvldb->serverFlags[i]);
- if (!(tmp & AFS_VLSF_DONTUSE) &&
- (tmp & AFS_VLSF_NEWREPSITE))
- new_only = true;
- }
-
vlflags = ntohl(uvldb->flags);
for (i = 0; i < nr_servers; i++) {
struct afs_uuid__xdr *xdr;
struct afs_uuid *uuid;
+ u32 tmp = ntohl(uvldb->serverFlags[i]);
int j;
int n = entry->nr_servers;
- tmp = ntohl(uvldb->serverFlags[i]);
- if (tmp & AFS_VLSF_DONTUSE ||
- (new_only && !(tmp & AFS_VLSF_NEWREPSITE)))
- continue;
if (tmp & AFS_VLSF_RWVOL) {
entry->fs_mask[n] |= AFS_VOL_VTM_RW;
if (vlflags & AFS_VLF_BACKEXISTS)
@@ -82,6 +68,7 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call)
for (j = 0; j < 6; j++)
uuid->node[j] = (u8)ntohl(xdr->node[j]);
+ entry->vlsf_flags[n] = tmp;
entry->addr_version[n] = ntohl(uvldb->serverUnique[i]);
entry->nr_servers++;
}
@@ -84,11 +84,15 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
volume->type = params->type;
volume->type_force = params->force;
volume->name_len = vldb->name_len;
+ volume->creation_time = TIME64_MIN;
+ volume->update_time = TIME64_MIN;
refcount_set(&volume->ref, 1);
INIT_HLIST_NODE(&volume->proc_link);
INIT_WORK(&volume->destructor, afs_destroy_volume);
rwlock_init(&volume->servers_lock);
+ mutex_init(&volume->volsync_lock);
+ mutex_init(&volume->cb_check_lock);
rwlock_init(&volume->cb_v_break_lock);
INIT_LIST_HEAD(&volume->open_mmaps);
init_rwsem(&volume->open_mmaps_lock);
@@ -388,7 +392,11 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key)
discard = old;
}
- volume->update_at = ktime_get_real_seconds() + afs_volume_record_life;
+ /* Check more often if replication is ongoing. */
+ if (new->ro_replicating)
+ volume->update_at = ktime_get_real_seconds() + 2;
+ else
+ volume->update_at = ktime_get_real_seconds() + afs_volume_record_life;
write_unlock(&volume->servers_lock);
if (discard == old)
@@ -254,7 +254,6 @@ static void xdr_decode_YFSVolSync(const __be32 **_bp,
update = xdr_to_u64(x->vol_update_date);
do_div(update, 10 * 1000 * 1000);
volsync->update = update;
- volsync->mask |= AFS_VOLSYNC_CREATION | AFS_VOLSYNC_UPDATE;
}
*_bp += xdr_size(x);
@@ -442,7 +442,8 @@ enum yfs_cm_operation {
EM(afs_cb_break_for_unlink, "break-unlink") \
EM(afs_cb_break_for_update_regress, "update-regress") \
EM(afs_cb_break_for_volume_callback, "break-v-cb") \
- E_(afs_cb_break_for_vos_release, "break-vos-release")
+ EM(afs_cb_break_for_vos_release, "break-vos-release") \
+ E_(afs_cb_break_volume_excluded, "vol-excluded")
#define afs_rotate_traces \
EM(afs_rotate_trace_aborted, "Abortd") \