Thanks, Blair! I have pasted the detailed configurations and results for the hosts and VMs below.
* mpirun results (bandwidth, 4KB message size)
- between 2 hosts
mpirun -np 2 -host A,B : 3798.62 MB/s
mpirun -np 2 -host B,A : 3790.37 MB/s
- between 1 host and the other host's VM
mpirun -np 2 -host A,B_vm : 3554.95 MB/s
mpirun -np 2 -host B_vm,A : 804.30 MB/s
mpirun -np 2 -host B,A_vm : 3433.93 MB/s
mpirun -np 2 -host A_vm,B : 834.83 MB/s
- between 2 VMs on different hosts
mpirun -np 2 -host A_vm,B_vm : 796.67 MB/s
mpirun -np 2 -host B_vm,A_vm : 789.85 MB/s
* A host
[root@A tmp]# lscpu
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
CPU(s): 16
On-line CPU(s) list: 0-15
Thread(s) per core: 1
Core(s) per socket: 8
Socket(s): 2
NUMA node(s): 2
Vendor ID: GenuineIntel
CPU family: 6
Model: 45
Model name: Intel(R) Xeon(R) CPU E5-2650 0 @ 2.00GHz
Stepping: 7
CPU MHz: 1200.000
BogoMIPS: 3993.96
Virtualization: VT-x
L1d cache: 32K
L1i cache: 32K
L2 cache: 256K
L3 cache: 20480K
NUMA node0 CPU(s): 0-7
NUMA node1 CPU(s): 8-15
[root@A tmp]# numactl -H
available: 2 nodes (0-1)
node 0 cpus: 0 1 2 3 4 5 6 7
node 0 size: 24541 MB
node 0 free: 484 MB
node 1 cpus: 8 9 10 11 12 13 14 15
node 1 size: 24575 MB
node 1 free: 21446 MB
node distances:
node 0 1
0: 10 20
1: 20 10
* B host
[root@B tmp]# lscpu
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
CPU(s): 16
On-line CPU(s) list: 0-15
Thread(s) per core: 1
Core(s) per socket: 8
Socket(s): 2
NUMA node(s): 2
Vendor ID: GenuineIntel
CPU family: 6
Model: 45
Model name: Intel(R) Xeon(R) CPU E5-2650 0 @ 2.00GHz
Stepping: 7
CPU MHz: 1200.000
BogoMIPS: 3993.95
Virtualization: VT-x
L1d cache: 32K
L1i cache: 32K
L2 cache: 256K
L3 cache: 20480K
NUMA node0 CPU(s): 0-7
NUMA node1 CPU(s): 8-15
[root@B tmp]# numactl -H
available: 2 nodes (0-1)
node 0 cpus: 0 1 2 3 4 5 6 7
node 0 size: 24541 MB
node 0 free: 7483 MB
node 1 cpus: 8 9 10 11 12 13 14 15
node 1 size: 24575 MB
node 1 free: 23911 MB
node distances:
node 0 1
0: 10 20
1: 20 10
* A host's VM
[root@A_vm]# lscpu
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
CPU(s): 8
On-line CPU(s) list: 0-7
Thread(s) per core: 1
Core(s) per socket: 1
Socket(s): 8
NUMA node(s): 1
Vendor ID: GenuineIntel
CPU family: 6
Model: 13
Stepping: 3
CPU MHz: 1995.192
BogoMIPS: 3990.38
Hypervisor vendor: KVM
Virtualization type: full
L1d cache: 32K
L1i cache: 32K
L2 cache: 4096K
NUMA node0 CPU(s): 0-7
[root@A_vm]# numactl -H
available: 1 nodes (0)
node 0 cpus: 0 1 2 3 4 5 6 7
node 0 size: 15624 MB
node 0 free: 14788 MB
node distances:
node 0
0: 10
* B host's VM
[root@B_vm]# lscpu
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
CPU(s): 8
On-line CPU(s) list: 0-7
Thread(s) per core: 1
Core(s) per socket: 1
Socket(s): 8
NUMA node(s): 1
Vendor ID: GenuineIntel
CPU family: 6
Model: 13
Stepping: 3
CPU MHz: 1995.191
BogoMIPS: 3990.38
Hypervisor vendor: KVM
Virtualization type: full
L1d cache: 32K
L1i cache: 32K
L2 cache: 4096K
NUMA node0 CPU(s): 0-7
[root@B_vm]# numactl -H
available: 1 nodes (0)
node 0 cpus: 0 1 2 3 4 5 6 7
node 0 size: 15624 MB
node 0 free: 14791 MB
node distances:
node 0
0: 10
* libvirt xml
[root@hp4 pt2pt]# cat /tmp/test.xml
<domain type="kvm">
<uuid>ab14e717-90a9-4085-9a32-f0b24430b2c0</uuid>
<name>test</name>
<memory>16000000</memory>
<cpu>
<numa>
<cell id='0' cpus='0-7' memory="16000000" unit='KiB'/>
</numa>
</cpu>
<vcpu>8</vcpu>
<sysinfo type="smbios">
<system>
<entry name="manufacturer">RDO Project</entry>
<entry name="product">OpenStack Nova</entry>
<entry name="version">2014.1.3-2.el7.centos</entry>
<entry name="serial">16353439-3339-5553-4532-333845585934</entry>
<entry name="uuid">ab14e717-90a9-4085-9a32-f0b24430b2c0</entry>
</system>
</sysinfo>
<os>
<type>hvm</type>
<boot dev="hd"/>
<smbios mode="sysinfo"/>
</os>
<features>
<acpi/>
<apic/>
</features>
<clock offset="utc">
<timer name="pit" tickpolicy="delay"/>
<timer name="rtc" tickpolicy="catchup"/>
<timer name="hpet" present="no"/>
</clock>
<cpu mode="host-model" match="exact"/>
<devices>
<disk type="file" device="disk">
<driver name="qemu" type="qcow2" cache="none"/>
<source file="/tmp/disk"/>
<target bus="virtio" dev="vda"/>
</disk>
<interface type='hostdev' managed='yes'>
<source>
<address type='pci' bus="0x07" domain="0x0" function="0x2" slot="0x01"/>
</source>
<mac address='5a:16:3e:6c:d9:1f'/>
<vlan>
<tag id='1000'/>
</vlan>
</interface>
<serial type='pty'>
<target port='0'/>
</serial>
<console type='pty'>
<target type='serial' port='0'/>
</console>
</devices>
</domain>