Add 'resume' capability to parallel-rsync-repos

This commit is contained in:
Jacob Vosmaer 2015-12-08 15:08:22 +01:00
parent 6d2be0212c
commit f3ca92a062
2 changed files with 75 additions and 24 deletions

View file

@ -1,31 +1,33 @@
#!/bin/sh
# this script should run as the 'git' user, not root, because of mkdir
#!/usr/bin/env bash
# this script should run as the 'git' user, not root, because 'root' should not
# own intermediate directories created by rsync.
#
# Example invocation:
# find /var/opt/gitlab/git-data/repositories -maxdepth 2 | \
# parallel-rsync-repos /var/opt/gitlab/git-data/repositories /mnt/gitlab/repositories
# parallel-rsync-repos transfer-success.log /var/opt/gitlab/git-data/repositories /mnt/gitlab/repositories
#
# You can also rsync to a remote destination.
#
# parallel-rsync-repos /var/opt/gitlab/git-data/repositories user@host:/mnt/gitlab/repositories
# parallel-rsync-repos transfer-success.log /var/opt/gitlab/git-data/repositories user@host:/mnt/gitlab/repositories
#
# If you need to pass extra options to rsync, set the RSYNC variable
#
# env RSYNC='rsync --rsh="foo bar"' parallel-rsync-repos /src dest
# env RSYNC='rsync --rsh="foo bar"' parallel-rsync-repos transfer-success.log /src dest
#
SRC=$1
DEST=$2
LOGFILE=$1
SRC=$2
DEST=$3
if [ -z "$LOGFILE" ] || [ -z "$SRC" ] || [ -z "$DEST" ] ; then
echo "Usage: $0 LOGFILE SRC DEST"
exit 1
fi
if [ -z "$JOBS" ] ; then
JOBS=10
fi
if [ -z "$SRC" ] || [ -z "$DEST" ] ; then
echo "Usage: $0 SRC DEST"
exit 1
fi
if [ -z "$RSYNC" ] ; then
RSYNC=rsync
fi
@ -35,5 +37,18 @@ if ! cd $SRC ; then
exit 1
fi
sed "s|$SRC|./|" |\
parallel -j$JOBS --progress "mkdir -p $DEST/{} && $RSYNC --delete -a {}/. $DEST/{}/"
rsyncjob() {
relative_dir="./${1#$SRC}"
if ! $RSYNC --delete --relative -a "$relative_dir" "$DEST" ; then
echo "rsync $1 failed"
return 1
fi
echo "$1" >> $LOGFILE
}
export LOGFILE SRC DEST RSYNC
export -f rsyncjob
parallel -j$JOBS --progress rsyncjob

View file

@ -96,25 +96,59 @@ after switching to the new repository storage directory.
### Parallel rsync for all repositories known to GitLab
This will sync repositories with 10 rsync processes at a time.
This will sync repositories with 10 rsync processes at a time. We keep
track of progress so that the transfer can be restarted if necessary.
First we create a new directory, owned by 'git', to hold transfer
logs. We assume the directory is empty before we start the transfer
procedure, and that we are the only ones writing files in it.
```
# Omnibus
sudo gitlab-rake gitlab:list_repos |\
sudo -u git \
/usr/bin/env JOBS=10 \
/opt/gitlab/embedded/service/gitlab-rails/bin/parallel-rsync-repoos \
/var/opt/gitlab/git-data/repositories \
/mnt/gitlab/repositories
sudo mkdir /var/opt/gitlab/transfer-logs
sudo chown git:git /var/opt/gitlab/transfer-logs
# Source
sudo -u git -H mkdir /home/git/transfer-logs
```
We seed the process with a list of the directories we want to copy.
```
# Omnibus
sudo -u git sh -c 'gitlab-rake gitlab:list_repos > /var/opt/gitlab/transfer-logs/all-repos-$(date +%s).txt'
# Source
cd /home/git/gitlab
sudo -u git -H bundle exec rake gitlab:list_repos |\
sudo -u git -H \
sudo -u git -H sh -c 'bundle exec rake gitlab:list_repos > /home/git/transfer-logs/all-repos-$(date +%s).txt'
```
Now we can start the transfer. The command below is idempotent, and
the number of jobs done by GNU Parallel should converge to zero. If it
does not some repositories listed in all-repos-1234.txt may have been
deleted/renamed before they could be copied.
```
# Omnibus
sudo -u git sh -c '
cat /var/opt/gitlab/transfer-logs/* | sort | uniq -u |\
/usr/bin/env JOBS=10 \
/opt/gitlab/embedded/service/gitlab-rails/bin/parallel-rsync-repos \
/var/opt/gitlab/transfer-logs/succes-$(date +%s).log \
/var/opt/gitlab/git-data/repositories \
/mnt/gitlab/repositories
'
# Source
cd /home/git/gitlab
sudo -u git -H sh -c '
cat /home/git/transfer-logs/* | sort | uniq -u |\
/usr/bin/env JOBS=10 \
bin/parallel-rsync-repos \
/home/git/transfer-logs/succes-$(date +%s).log \
/home/git/repositories \
/mnt/gitlab/repositories
`
```
### Parallel rsync only for repositories with recent activity
@ -129,7 +163,8 @@ gitlab:list_repos' to only print repositories with recent activity.
sudo gitlab-rake gitlab:list_repos SINCE='2015-10-1 12:00 UTC' |\
sudo -u git \
/usr/bin/env JOBS=10 \
/opt/gitlab/embedded/service/gitlab-rails/bin/parallel-rsync-repoos \
/opt/gitlab/embedded/service/gitlab-rails/bin/parallel-rsync-repos \
succes-$(date +%s).log \
/var/opt/gitlab/git-data/repositories \
/mnt/gitlab/repositories
@ -139,6 +174,7 @@ sudo -u git -H bundle exec rake gitlab:list_repos SINCE='2015-10-1 12:00 UTC' |\
sudo -u git -H \
/usr/bin/env JOBS=10 \
bin/parallel-rsync-repos \
succes-$(date +%s).log \
/home/git/repositories \
/mnt/gitlab/repositories
```