NP04 Disk Servers

RAID recovery

mdadm --zero-superblock /dev/sdXX

systemctl start dev-md1.device
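
Before zeroing a superblock it is worth confirming which array, if any, still claims the disk. A quick sketch, with /dev/sdXX as a placeholder for the real device:

cat /proc/mdstat
mdadm --examine /dev/sdXX
mdadm --detail /dev/md1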

np04-srv-002

[root@np04-srv-002 ~]# mdadm --add /dev/md0 /dev/sda
mdadm: added /dev/sda
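
The rebuild can be followed from /proc/mdstat or from the array detail, e.g.:

cat /proc/mdstat
mdadm --detail /dev/md0 | grep -E 'State|Rebuild'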

np04-srv-004 RAID recovery

Devices: /dev/sdae /dev/sdl /dev/sdaf /dev/sdp /dev/sdag /dev/sdm /dev/sdk /dev/sdac /dev/sdab /dev/sdn /dev/sdad /dev/sdq

mdadm --create /dev/md1 --verbose --level=5 --raid-devices=11 --spare-devices=1 /dev/sdae /dev/sdl /dev/sdaf /dev/sdp /dev/sdag /dev/sdm /dev/sdk /dev/sdac /dev/sdab /dev/sdn /dev/sdad /dev/sdq

[root@np04-srv-004 ~]# mdadm --assemble /dev/md1 /dev/sdae /dev/sdl /dev/sdaf /dev/sdp /dev/sdag /dev/sdm /dev/sdk /dev/sdac /dev/sdab /dev/sdn /dev/sdad /dev/sdq
mdadm: no recogniseable superblock on /dev/sdae
mdadm: /dev/sdae has no superblock - assembly aborted

mdadm --zero-superblock /dev/sdae /dev/sdl /dev/sdaf /dev/sdp /dev/sdag /dev/sdm /dev/sdk /dev/sdac /dev/sdab /dev/sdn /dev/sdad /dev/sdq

[root@np04-srv-004 ~]# mdadm --assemble /dev/md1 /dev/sdae /dev/sdl /dev/sdaf /dev/sdp /dev/sdag /dev/sdm /dev/sdk /dev/sdac /dev/sdab /dev/sdn /dev/sdad /dev/sdq
mdadm: Found some drive for an array that is already active: /dev/md/1
mdadm: giving up.

[root@np04-srv-004 ~]# mdadm --examine --verbose --scan
ARRAY /dev/md/boot  level=raid1 metadata=1.2 num-devices=2 UUID=ac6d4fc2:32897409:aefaf5f8:ff794e70 name=np04-srv-004:boot
   devices=/dev/sdax1,/dev/sdaw1
ARRAY /dev/md/pv00  level=raid1 metadata=1.2 num-devices=2 UUID=ff67f08c:30df410c:d978e279:f9be6da4 name=np04-srv-004:pv00
   devices=/dev/sdax2,/dev/sdaw2
ARRAY /dev/md/3  level=raid5 metadata=1.2 num-devices=11 UUID=41fe515a:816e967a:185c239d:5ecbe9dd name=np04-srv-004:3
   spares=1   devices=/dev/sds,/dev/sdas,/dev/sdau,/dev/sdu,/dev/sdat,/dev/sdar,/dev/sdw,/dev/sdaq,/dev/sdt,/dev/sdav,/dev/sdv,/dev/sdx
ARRAY /dev/md/0  level=raid5 metadata=1.2 num-devices=11 UUID=96bab78f:db437cbe:d9912422:9cfb314f name=np04-srv-004:0
   spares=1   devices=/dev/sdb,/dev/sdg,/dev/sdd,/dev/sde,/dev/sdaa,/dev/sdy,/dev/sdz,/dev/sdh,/dev/sdc,/dev/sdi,/dev/sda,/dev/sdf
ARRAY /dev/md/1  level=raid5 metadata=1.2 num-devices=11 UUID=42e3a39a:1144f035:237d0cd0:04ddaf36 name=np04-srv-004:1
   spares=1   devices=/dev/sdae,/dev/sdl,/dev/sdaf,/dev/sdp,/dev/sdag,/dev/sdm,/dev/sdk,/dev/sdac,/dev/sdab,/dev/sdn,/dev/sdad,/dev/sdq
ARRAY /dev/md/2  level=raid5 metadata=1.2 num-devices=11 UUID=bf465668:51771dcd:70d9053f:e18e2725 name=np04-srv-004:2
   spares=1   devices=/dev/sdao,/dev/sdam,/dev/sdai,/dev/sdan,/dev/sdah,/dev/sdo,/dev/sdak,/dev/sdap,/dev/sdr,/dev/sdj,/dev/sdaj,/dev/sdal

[root@np04-srv-004 ~]# mdadm --detail --scan
ARRAY /dev/md/pv00 metadata=1.2 name=np04-srv-004:pv00 UUID=ff67f08c:30df410c:d978e279:f9be6da4
ARRAY /dev/md/boot metadata=1.2 name=np04-srv-004:boot UUID=ac6d4fc2:32897409:aefaf5f8:ff794e70
ARRAY /dev/md/3 metadata=1.2 spares=1 name=np04-srv-004:3 UUID=41fe515a:816e967a:185c239d:5ecbe9dd
ARRAY /dev/md/0 metadata=1.2 spares=1 name=np04-srv-004:0 UUID=96bab78f:db437cbe:d9912422:9cfb314f
INACTIVE-ARRAY /dev/md1 metadata=1.2
ARRAY /dev/md/2 metadata=1.2 spares=1 name=np04-srv-004:2 UUID=bf465668:51771dcd:70d9053f:e18e2725

[root@np04-srv-004 ~]# mdadm --assemble /dev/md1 /dev/sda[c-h] /dev/sd[l-n] /dev/sdq /dev/sds /dev/sdo
mdadm: failed to RUN_ARRAY /dev/md1: Input/output error
[root@np04-srv-004 ~]# mdadm --assemble /dev/md1 /dev/sda[c-h] /dev/sd[l-n] /dev/sdq /dev/sds
mdadm: /dev/md1 has been started with 11 drives.
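
The array came up with 11 of its 12 members. If the remaining disk is meant to act as the spare again, it can be re-added once the array is running; the device name below is a placeholder, check the --examine output for the unused member first.

mdadm --add /dev/md1 /dev/sdXX
mdadm --detail /dev/md1
cat /proc/mdstat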

np04-srv-003 RAID recovery

Devices: /dev/sdad /dev/sdah /dev/sdaj /dev/sdak /dev/sdaf /dev/sdab /dev/sdag /dev/sdal /dev/sdai /dev/sdac /dev/sdaa /dev/sdae

mdadm --create /dev/md2 --verbose --level=5 --raid-devices=11 --spare-devices=1 /dev/sdad /dev/sdah /dev/sdaj /dev/sdak /dev/sdaf /dev/sdab /dev/sdag /dev/sdal /dev/sdai /dev/sdac /dev/sdaa /dev/sdae

mdadm --assemble /dev/md2 /dev/sdad /dev/sdah /dev/sdaj /dev/sdak /dev/sdaf /dev/sdab /dev/sdag /dev/sdal /dev/sdai /dev/sdac /dev/sdaa /dev/sdae

[root@np04-srv-003 ~]# mdadm --examine --verbose --scan
ARRAY /dev/md/boot  level=raid1 metadata=1.0 num-devices=2 UUID=5c82b8ac:5442a890:afc0f7de:0dac9e4c name=np04-srv-003:boot
   devices=/dev/sdb1,/dev/sda1
ARRAY /dev/md/pv.1  level=raid1 metadata=1.2 num-devices=2 UUID=ac8855bc:3923382f:3832a119:52003576 name=np04-srv-003:pv.1
   devices=/dev/sdb2,/dev/sda2
ARRAY /dev/md/0  level=raid5 metadata=1.2 num-devices=11 UUID=548c82e0:f212d403:b7f832cb:f1dbc977 name=np04-srv-003:0
   spares=1   devices=/dev/sdl,/dev/sdm,/dev/sdn,/dev/sdk,/dev/sdi,/dev/sde,/dev/sdg,/dev/sdh,/dev/sdd,/dev/sdj,/dev/sdc,/dev/sdf
ARRAY /dev/md/1  level=raid5 metadata=1.2 num-devices=11 UUID=28a9f930:3b42f84b:ba0e53b4:0e6800dd name=np04-srv-003:1
   spares=1   devices=/dev/sdy,/dev/sdx,/dev/sdw,/dev/sdv,/dev/sdp,/dev/sdr,/dev/sdo,/dev/sdz,/dev/sdq,/dev/sdt,/dev/sds,/dev/sdu
ARRAY /dev/md/2  level=raid5 metadata=1.2 num-devices=11 UUID=58187368:a15f5263:7314ce18:8bd396d7 name=np04-srv-003:2
   spares=1   devices=/dev/sdad,/dev/sdah,/dev/sdaj,/dev/sdak,/dev/sdaf,/dev/sdab,/dev/sdag,/dev/sdal,/dev/sdai,/dev/sdac,/dev/sdaa,/dev/sdae
ARRAY /dev/md/3  level=raid5 metadata=1.2 num-devices=11 UUID=dbe7277a:eafea0dc:db0585fa:62f6db26 name=np04-srv-003:3
   spares=1   devices=/dev/sdar,/dev/sdao,/dev/sdap,/dev/sdaq,/dev/sdan,/dev/sdaw,/dev/sdam,/dev/sdav,/dev/sdat,/dev/sdax,/dev/sdas,/dev/sdau

[root@np04-srv-003 ~]# mdadm --assemble /dev/md2 /dev/sda[a-i] /dev/sd[k-l] /dev/sdq
mdadm: /dev/sdk is busy - skipping
mdadm: /dev/sdl is busy - skipping
mdadm: /dev/sdq is busy - skipping
mdadm: /dev/md2 assembled from 9 drives - not enough to start the array.

[root@np04-srv-003 ~]# mdadm --assemble --scan
mdadm: /dev/md/0 has been started with 11 drives and 1 spare.
mdadm: timeout waiting for /dev/md/0
mdadm: /dev/md/1 has been started with 11 drives and 1 spare.
mdadm: timeout waiting for /dev/md/1
mdadm: failed to RUN_ARRAY /dev/md/2: Input/output error
mdadm: /dev/md/3 has been started with 11 drives and 1 spare.
mdadm: timeout waiting for /dev/md/3
mdadm: failed to RUN_ARRAY /dev/md/2: Input/output error
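
When RUN_ARRAY fails with an Input/output error, a useful next step (generic mdadm usage, not a command taken from the log above) is to compare the event counters and device roles of the members before deciding whether a forced assembly is safe:

for d in /dev/sdad /dev/sdah /dev/sdaj /dev/sdak /dev/sdaf /dev/sdab /dev/sdag /dev/sdal /dev/sdai /dev/sdac /dev/sdaa /dev/sdae; do
    echo "== $d =="
    mdadm --examine $d | grep -E 'Events|Device Role|Array State'
done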

Update Kernel on np04-srv-002-ctrl

Instructions are at http://elrepo.org/tiki/kernel-lt. We selected the "long term" (lt) support version; we don't need the latest and greatest, just enough support for some additional disk-access tuning.
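
The ELRepo repository has to be installed before the kernel-lt packages are visible. A minimal sketch of the standard ELRepo setup for CentOS 7 (the exact elrepo-release RPM version below is an assumption and may have changed since):

rpm --import https://www.elrepo.org/RPM-GPG-KEY-elrepo.org
rpm -Uvh http://www.elrepo.org/elrepo-release-7.0-3.el7.elrepo.noarch.rpm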

Kernel Listing

Installed Kernels

[root@np04-srv-002 ~]# rpm -qa kernel
kernel-3.10.0-693.el7.x86_64

Running Kernel

[root@np04-srv-002 ~]# uname -r
3.10.0-693.el7.x86_64

Available Kernels

[root@np04-srv-002 ~]# yum list --enablerepo=elrepo-kernel | grep kernel-lt
kernel-lt.x86_64                            4.4.126-1.el7.elrepo       elrepo-kernel
kernel-lt-devel.x86_64                      4.4.126-1.el7.elrepo       elrepo-kernel
kernel-lt-doc.noarch                        4.4.126-1.el7.elrepo       elrepo-kernel
kernel-lt-headers.x86_64                    4.4.126-1.el7.elrepo       elrepo-kernel
kernel-lt-tools.x86_64                      4.4.126-1.el7.elrepo       elrepo-kernel
kernel-lt-tools-libs.x86_64                 4.4.126-1.el7.elrepo       elrepo-kernel
kernel-lt-tools-libs-devel.x86_64           4.4.126-1.el7.elrepo       elrepo-kernel

Install Kernel

yum --enablerepo=elrepo-kernel install kernel-lt

Then reboot. The most recently installed kernel is loaded at boot.
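
After the reboot, verify that the lt kernel is actually running. If the old kernel still comes up, the GRUB default may need to be pointed at the newest entry (per the ELRepo instructions, assuming GRUB_DEFAULT=saved in /etc/default/grub):

uname -r
rpm -qa kernel-lt
grub2-set-default 0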

np04-srv-002

Remove Old Raids

md127 and md126 are the mirrored system disks. Do not remove these.

[root@np04-srv-002 ~]# fdisk -l | grep md
WARNING: fdisk GPT support is currently new, and therefore in an experimental phase. Use at your own discretion.
Disk /dev/md127: 1049 MB, 1049034752 bytes, 2048896 sectors
Disk /dev/md126: 845.8 GB, 845840121856 bytes, 1652031488 sectors
Disk /dev/md125: 54009.4 GB, 54009367363584 bytes, 105487045632 sectors
Disk /dev/md124: 54009.4 GB, 54009367363584 bytes, 105487045632 sectors
Disk /dev/md123: 48008.3 GB, 48008326545408 bytes, 93766262784 sectors
Disk /dev/md122: 54009.4 GB, 54009367363584 bytes, 105487045632 sectors

[root@np04-srv-002 ~]# mdadm --stop /dev/md122
mdadm: stopped /dev/md122
[root@np04-srv-002 ~]# mdadm --stop /dev/md123
mdadm: stopped /dev/md123
[root@np04-srv-002 ~]# mdadm --stop /dev/md124
mdadm: stopped /dev/md124
[root@np04-srv-002 ~]# mdadm --stop /dev/md125
mdadm: stopped /dev/md125
[root@np04-srv-002 ~]# 
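
The member disks of the stopped arrays still carry md superblocks and would be picked up again by the next scan. Zeroing the superblocks, as at the top of this page, prevents that; the device names below are placeholders, list the real members with mdadm --examine --scan first.

mdadm --zero-superblock /dev/sdXX /dev/sdYY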

Configurations

Commands Issued

Increase Performance

As a result of the tuning below we get ~2.5 GB/s sequential write performance, which is only partially affected by concurrent reading (~4 GB/s sequential read). A sketch for making these settings persistent follows the list.
  1. The oflag=dsync option of dd artificially lowers the performance, forcing a synchronisation that is not really needed. Just changing the way we did the measurements (conv=fsync) already brought the performance up by 25% (400 MB/s -> 500 MB/s).
    • dd if=/dev/zero of=/data3/test/test3.img bs=1G count=1 conv=fsync
  2. We increased progressively the number of threads in /sys/block/md*/md/group_thread_cnt and found a reasonable plateau at 4.
    • echo 4 > /sys/block/md0/md/group_thread_cnt
    • echo 4 > /sys/block/md1/md/group_thread_cnt
    • echo 4 > /sys/block/md2/md/group_thread_cnt
    • echo 4 > /sys/block/md3/md/group_thread_cnt
  3. We reduced the dirty_background_ratio and dirty_ratio, in order to reduce RAM utilisation (this may be tuned once we know better what will run on those nodes)
    • echo 1 > /proc/sys/vm/dirty_background_ratio
    • echo 2 > /proc/sys/vm/dirty_ratio
  4. We set the read_ahead to 65536
    • for i in `seq 0 3`; do blockdev --setra 65536 /dev/md$i ; done
  5. we increased the min/max sync speeds:
    • echo 50000 > /proc/sys/dev/raid/speed_limit_min
    • echo 5000000 > /proc/sys/dev/raid/speed_limit_max
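
None of these settings survive a reboot on their own. A minimal sketch of making them persistent, assuming rc-local.service is enabled; the sysctl keys correspond to the /proc paths used above:

cat > /etc/sysctl.d/90-raid-tuning.conf <<'EOF'
vm.dirty_background_ratio = 1
vm.dirty_ratio = 2
dev.raid.speed_limit_min = 50000
dev.raid.speed_limit_max = 5000000
EOF

cat >> /etc/rc.d/rc.local <<'EOF'
# group_thread_cnt and read-ahead are not sysctls, so set them here
for f in /sys/block/md[0-3]/md/group_thread_cnt; do echo 4 > $f; done
for i in `seq 0 3`; do blockdev --setra 65536 /dev/md$i; done
EOF
chmod +x /etc/rc.d/rc.local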

Create RAID Sets

The configuration has 4 independent devices with 12 disks each (one of which is declared as a spare, so that rebuilding starts without human intervention).
  • data0
    • mdadm --create /dev/md0 --verbose --level=5 --raid-devices=11 --spare-devices=1 /dev/sd[c-n]
    • mkfs.xfs /dev/md0
    • mount /dev/md0 /data0
  • data1
    • mdadm --create /dev/md1 --verbose --level=5 --raid-devices=11 --spare-devices=1 /dev/sd[o-z]
    • mkfs.xfs /dev/md1
    • mount /dev/md1 /data1
  • data2
    • mdadm --create /dev/md2 --verbose --level=5 --raid-devices=11 --spare-devices=1 /dev/sda[a-l]
    • mkfs.xfs /dev/md2
    • mount /dev/md2 /data2
  • data3
    • mdadm --create /dev/md3 --verbose --level=5 --raid-devices=11 --spare-devices=1 /dev/sda[m-x]
    • mkfs.xfs /dev/md3
    • mount /dev/md3 /data3

mkdir /data0
mkdir /data1
mkdir /data2
mkdir /data3

chmod g+w /data0
chmod g+w /data1
chmod g+w /data2
chmod g+w /data3
chmod o+w /data0
chmod o+w /data1
chmod o+w /data2
chmod o+w /data3

Check RAID Status

mdadm --detail /dev/md0
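
For a quick overview of all four data arrays:

cat /proc/mdstat
for i in `seq 0 3`; do mdadm --detail /dev/md$i | grep -E 'State :|Active Devices|Spare Devices'; done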

Save RAID configuration

mdadm --examine --scan >> mdadm.conf

[root@np04-srv-001 ~]# mdadm --examine --scan
ARRAY /dev/md/boot  metadata=1.0 UUID=fda41cd4:4be92519:f6cabecb:8887200c name=np04-srv-001:boot
ARRAY /dev/md/pv.1  metadata=1.2 UUID=9ea4f570:4d700476:2d64744e:9b4510af name=np04-srv-001:pv.1
ARRAY /dev/md/0  metadata=1.2 UUID=374d1081:011c7331:1a7f9aac:21a63a9d name=np04-srv-001:0  spares=1
ARRAY /dev/md/1  metadata=1.2 UUID=4a387f71:ed7b555d:112b21cd:fc9e7d48 name=np04-srv-001:1  spares=1
ARRAY /dev/md/2  metadata=1.2 UUID=48791578:8c106a52:4da28655:775d5f99 name=np04-srv-001:2  spares=1
ARRAY /dev/md/3  metadata=1.2 UUID=3aed8074:69217090:467ccaf4:1a544349 name=np04-srv-001:3  spares=1
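
On CentOS 7 the file mdadm actually reads is /etc/mdadm.conf, so the scan output should end up there rather than in the current directory. A sketch; the mail address is a placeholder (see the MAILADDR line in the report below):

mdadm --examine --scan >> /etc/mdadm.conf
echo "MAILADDR admin@example.com" >> /etc/mdadm.conf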

Add filesystems to /etc/fstab

/dev/md0  /data0  xfs  defaults  0 0 
/dev/md1  /data1  xfs  defaults  0 0 
/dev/md2  /data2  xfs  defaults  0 0 
/dev/md3  /data3  xfs  defaults  0 0 
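
A quick check of the new entries before the next reboot:

mount -a
df -h /data0 /data1 /data2 /data3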

Performance and Configuration Report from Giovanna

Received on Friday, December 08, 2017 11:00.

Dear Geoff,
Thanks to Wainer and the testing session we just had, I think that we can now prepare the "final" configuration of the storage servers.
Please find enclosed a summary of our tests.

1) the oflag=dsync option of dd artificially lowers the performance, forcing a synchronisation that is not really needed. Therefore, just changing the way we did measurements (conv=fsync) already brought the performance up by 25% (400 MB/s -> 500 MB/s)

2) We increased progressively the number of threads in /sys/block/md*/md/group_thread_cnt and found a reasonable plateau at 4.
~> for i in  /sys/block/md*/md/group_thread_cnt; do echo 4 > $i;done

3) We reduced the dirty_background_ratio and dirty_ratio, in order to reduce RAM utilisation (this may be tuned once we know better what will run on those nodes)
~> echo 1 > /proc/sys/vm/dirty_background_ratio
~> echo 2 > /proc/sys/vm/dirty_ratio
 
4) We set the read_ahead to 65536
~> for i in `seq 0 3`; do blockdev --setra 65536 /dev/md$i ; done

5) we increased the min/max sync speeds:
~> echo 50000 > /proc/sys/dev/raid/speed_limit_min
~> echo 5000000 > /proc/sys/dev/raid/speed_limit_max

As a result we get ~2.5 GB/s sequential write performance which is only affected partially by reading (we get ~4GB/s sequential read perf).

Last thing to decide is if we bother using RAID6 or we are happy with RAID5. 
RAID 5 will give us a small performance increase (5-10%) as well as disk space increase (8%), while RAID 6 allows us to not lose data even if a disk breaks while we are recovering from a disk failure. Since disks don't die like flies and our data are meant to be short-lived anyway, I would tend to go for RAID5.

The configuration has 4 independent devices with 12 disks each (one of which declared as spare, such that rebuilding will start without human intervention).

Just for reference, the creation command can be (for 1 device):
~> mdadm --create --verbose /dev/md0 --level=5 --raid-devices=11 /dev/sdaa /dev/sdab /dev/sdac /dev/sdad /dev/sdae /dev/sdaf /dev/sdag /dev/sdah /dev/sdai /dev/sdaj /dev/sdak --spare-devices=1 /dev/sdal
~> mkfs.xfs /dev/md0
~> mount /dev/md0 /data0

In /etc/mdadm.conf we should specify an email address to get notified of any failures and make sure that the mdmonitor.service is running correctly.
Example:
[root@np04-srv-002 ~]# cat /etc/mdadm.conf
MAILADDR giovanna.lehmann@cern.ch

Ciao
Giovanna
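
A sketch of the monitoring setup the report suggests: with MAILADDR set in /etc/mdadm.conf, make sure mdmonitor is running and send a one-off test notification (--oneshot and --test are standard mdadm monitor options):

systemctl start mdmonitor.service
systemctl status mdmonitor.service
mdadm --monitor --scan --oneshot --test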

[root@np04-srv-002 ~]# mdadm --create /dev/md0 --verbose --level=5 --raid-devices=11 --spare-devices=1 /dev/sd[c-n]
mdadm: layout defaults to left-symmetric
mdadm: layout defaults to left-symmetric
mdadm: chunk size defaults to 512K
mdadm: partition table exists on /dev/sdc
mdadm: partition table exists on /dev/sdc but will be lost or
       meaningless after creating array
mdadm: /dev/sdd appears to be part of a raid array:
       level=raid6 devices=11 ctime=Thu Oct 19 10:50:43 2017
mdadm: /dev/sde appears to be part of a raid array:
       level=raid6 devices=11 ctime=Thu Oct 19 10:50:43 2017
mdadm: /dev/sdf appears to be part of a raid array:
       level=raid6 devices=11 ctime=Thu Oct 19 10:50:43 2017
mdadm: /dev/sdg appears to be part of a raid array:
       level=raid6 devices=11 ctime=Thu Oct 19 10:50:43 2017
mdadm: /dev/sdh appears to be part of a raid array:
       level=raid6 devices=11 ctime=Thu Oct 19 10:50:43 2017
mdadm: /dev/sdi appears to be part of a raid array:
       level=raid6 devices=11 ctime=Thu Oct 19 10:50:43 2017
mdadm: /dev/sdj appears to be part of a raid array:
       level=raid6 devices=11 ctime=Thu Oct 19 10:50:43 2017
mdadm: /dev/sdk appears to be part of a raid array:
       level=raid6 devices=11 ctime=Thu Oct 19 10:50:43 2017
mdadm: /dev/sdl appears to be part of a raid array:
       level=raid6 devices=11 ctime=Thu Oct 19 10:50:43 2017
mdadm: /dev/sdm appears to be part of a raid array:
       level=raid6 devices=11 ctime=Thu Oct 19 10:50:43 2017
mdadm: /dev/sdn appears to be part of a raid array:
       level=raid6 devices=11 ctime=Thu Oct 19 10:50:43 2017
mdadm: size set to 5860391424K
mdadm: automatically enabling write-intent bitmap on large array
Continue creating array? y
mdadm: Defaulting to version 1.2 metadata
mdadm: array /dev/md0 started.

-- DavidGeoffreySavage - 2018-03-13
