[comp.os.mach] Mach context switch time

hue@netcom.UUCP (Jonathan Hue) (10/12/89)

>        Mean real time per context switch, corresponding context switch rate:
>
>        PDP-11/45       EPOS              170 usec      5,882 / sec
>        DECStation 3100 ULTRIX 2.0 R7     390           2,564
>        Sun 4/280       SunOs Sys4-3.2R2  623           1,605
>        VAX 8650        BSD 4.3           723           1,383
>        NeXT            MACH              873           1,145

The following program executes the loop 10000 times in 7.3 seconds on a
NeXT.  Each pass through the loop involves 2 context switches, so this
works out to 2,740 context switches/sec, or 365 usec per context switch.

#include <sys/types.h>
#include <stdio.h>
#include <mach.h>
#include <sys/message.h>

struct mymsg
{
    msg_header_t	my_header;
    msg_type_t		my_type;
};

main(argc, argv)
int argc;
char **argv;
{
    register int iterations=1;
    register kern_return_t error;
    port_t port, newport;
    unsigned int num_ports;
    struct mymsg msg;
    port_t port_set[10];
    port_array_t new_set;

    if (argc == 2)
	iterations = atoi(*++argv);
    if ((error = port_allocate(task_self(), &port)) != KERN_SUCCESS)
	mach_error("port_allocate", error);
    if ((error = port_set_backlog(task_self(), port, 10)) != KERN_SUCCESS)
	mach_error("port_set_backlog", error);
    port_set[0] = port;
    mach_ports_register(task_self(), port_set, 1);
    switch (fork())
    {
	case -1:
	    perror("fork");
	    exit(1);
	case 0:
	    mach_ports_lookup(task_self(), &new_set, &num_ports);
	    port = *new_set;
	    if ((error = port_allocate(task_self(), &newport)) != KERN_SUCCESS)
		mach_error("port_allocate", error);
	    msg.my_header.msg_remote_port = port;
	    msg.my_header.msg_local_port = newport;
	    msg.my_header.msg_id = 0xc0ffee;
	    msg.my_header.msg_size = sizeof(msg);
	    msg.my_header.msg_type = MSG_TYPE_NORMAL;
	    msg.my_header.msg_simple = TRUE;
	    
	    msg.my_type.msg_type_name = MSG_TYPE_INTEGER_32;
	    msg.my_type.msg_type_size = 32;
	    msg.my_type.msg_type_number = 0;
	    msg.my_type.msg_type_inline = TRUE;
	    msg.my_type.msg_type_longform = FALSE;
	    msg.my_type.msg_type_deallocate = FALSE;
	    while (--iterations != -1)
	    {
		if ((error = msg_rpc(&(msg.my_header), SEND_SWITCH,
				     sizeof(msg), 0, 0)) != RPC_SUCCESS)
		    mach_error("msg_send", error);
	    }
	    exit(0);
	default:
	    msg.my_header.msg_local_port = port;
	    msg.my_header.msg_size = sizeof(msg);
	    while (--iterations != -1)
	    {
		if ((error = msg_receive(&(msg.my_header), MSG_OPTION_NONE, 0))
		    != RCV_SUCCESS)
		    mach_error("msg_receive", error);
		if ((error = msg_send(&(msg.my_header), SEND_SWITCH, 0)) !=
		     SEND_SUCCESS)
		    mach_error("msg_receive", error);
	    }
	    break;
    }
    wait(0);
}


-Jonathan

Richard.Draves@CS.CMU.EDU (10/15/89)

In a following message, I'll post some performance numbers in response
to Ken Birman's request for a comparison of Mach IPC and Unix IPC.  But
first I would like to comment on this program.

> Excerpts from netnews.comp.os.mach: 12-Oct-89 Mach context switch time
> Jonathan Hue@netcom.UUCP (2816)

> #include <sys/types.h>
> #include <stdio.h>
> #include <mach.h>
> #include <sys/message.h>

> struct mymsg
> {
>     msg_header_t	my_header;
>     msg_type_t		my_type;
> };

> main(argc, argv)
> int argc;
> char **argv;
> {
>     register int iterations=1;
>     register kern_return_t error;
>     port_t port, newport;
>     unsigned int num_ports;
>     struct mymsg msg;
>     port_t port_set[10];
>     port_array_t new_set;

>     if (argc == 2)
> 	iterations = atoi(*++argv);
>     if ((error = port_allocate(task_self(), &port)) != KERN_SUCCESS)
> 	mach_error("port_allocate", error);
>     if ((error = port_set_backlog(task_self(), port, 10)) != KERN_SUCCESS)
> 	mach_error("port_set_backlog", error);
>     port_set[0] = port;
>     mach_ports_register(task_self(), port_set, 1);
>     switch (fork())
>     {
> 	case -1:
> 	    perror("fork");
> 	    exit(1);
> 	case 0:
> 	    mach_ports_lookup(task_self(), &new_set, &num_ports);
> 	    port = *new_set;
> 	    if ((error = port_allocate(task_self(), &newport)) != KERN_SUCCESS)
> 		mach_error("port_allocate", error);
> 	    msg.my_header.msg_remote_port = port;
> 	    msg.my_header.msg_local_port = newport;
> 	    msg.my_header.msg_id = 0xc0ffee;
> 	    msg.my_header.msg_size = sizeof(msg);
> 	    msg.my_header.msg_type = MSG_TYPE_NORMAL;
> 	    msg.my_header.msg_simple = TRUE;
> 	    
> 	    msg.my_type.msg_type_name = MSG_TYPE_INTEGER_32;
> 	    msg.my_type.msg_type_size = 32;
> 	    msg.my_type.msg_type_number = 0;
> 	    msg.my_type.msg_type_inline = TRUE;
> 	    msg.my_type.msg_type_longform = FALSE;
> 	    msg.my_type.msg_type_deallocate = FALSE;
> 	    while (--iterations != -1)
> 	    {
> 		if ((error = msg_rpc(&(msg.my_header), SEND_SWITCH,
> 				     sizeof(msg), 0, 0)) != RPC_SUCCESS)
> 		    mach_error("msg_rpc", error);
> 	    }
> 	    exit(0);
> 	default:
> 	    msg.my_header.msg_local_port = port;
> 	    msg.my_header.msg_size = sizeof(msg);
> 	    while (--iterations != -1)
> 	    {
> 		if ((error = msg_receive(&(msg.my_header), MSG_OPTION_NONE, 0))
> 		    != RCV_SUCCESS)
> 		    mach_error("msg_receive", error);
> 		if ((error = msg_send(&(msg.my_header), SEND_SWITCH, 0)) !=
> 		     SEND_SUCCESS)
> 		    mach_error("msg_send", error);
> 	    }
> 	    break;
>     }
>     wait(0);
> }


It is possible to use messages which have no body, just a header.  So
"struct mymsg" need not include "my_type".

There is no need to use port_set_backlog here, although it certainly
doesn't hurt.

My personal preference is to avoid mach_ports_register and
mach_ports_lookup when possible.  They are a hack that lets tasks
acquire some initial send rights, like for the name service.  For your
purposes, why not use netname_check_in and netname_look_up?

(OK, I can think of a reason you might have used mach_ports_register
and mach_ports_lookup.  They will work single-user, when the name
service isn't up.  I have a simple name server which I use when the
netmsgserver isn't running or available.)
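
For reference, the netname rendezvous would look something like this (a
sketch; "cswitch-bench" is just a name I made up, and name_server_port
is the global that mach_init provides):

    #include <servers/netname.h>

    /* Parent: check the port in under a well-known name. */
    if ((error = netname_check_in(name_server_port, "cswitch-bench",
				  task_self(), port)) != KERN_SUCCESS)
	mach_error("netname_check_in", error);

    /* Child (or any other task): look it up again; "" means this host. */
    if ((error = netname_look_up(name_server_port, "", "cswitch-bench",
				 &port)) != KERN_SUCCESS)
	mach_error("netname_look_up", error);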

The benchmark has the server include rights for the reply port (newport)
in the reply message.  MIG sets the msg_local_port field in reply
messages to PORT_NULL; this is a little faster because the kernel only
has to handle one port in the reply message instead of two.
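
In the server loop above, that's one extra assignment before the
msg_send (sketch):

		/* After msg_receive, msg_remote_port already names the
		   client's reply port.  Clear msg_local_port so the reply
		   carries no port rights back with it. */
		msg.my_header.msg_local_port = PORT_NULL;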

On most architectures, there is no problem with having the benchmark
program fork to get client and server tasks.  However, this doesn't work
very well on some machines, like RTs.  The problem is hardware
architectures which don't allow convenient sharing of physical pages. 
The RT only allows sharing of segments.  The RT pmap module
(machine-dependent VM module) isn't smart enough to figure out that the
text pages of the child and parent can be shared by using a single
segment; it uses two segments and shuffles the pages back and forth. 
What this means is that on the RT, the benchmark will be taking some
faults on every context switch.  These are relatively inexpensive
faults; they just need to fiddle with the RT's hardware data
structures.  But you probably don't want to be measuring them.  Other
architectures (I don't know of any off-hand) might suffer from similar
problems.

The SEND_SWITCH option for msg_rpc and msg_send is something that NeXT
decided to export to the user; in Mach 2.5, it is only available
internally.  It is a scheduling hint.  If SEND_SWITCH is used when a
message is sent, and a receiver is waiting, then the kernel will
context-switch immediately to the receiver.  Normally the sender keeps
running, and the receiver won't run until normal scheduling picks it.
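
In code, the difference is just the option argument (a sketch):

    /* Plain send: the sender keeps running; the receiver runs whenever
       the scheduler gets around to it. */
    msg_send(&msg.my_header, MSG_OPTION_NONE, 0);

    /* Hinted send: if a receiver is blocked on the port, hand the
       processor to it immediately. */
    msg_send(&msg.my_header, SEND_SWITCH, 0);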

msg_rpc turns on SEND_SWITCH internally, so there is no reason for a
NeXT user to use SEND_SWITCH with msg_rpc.  I doubt the SEND_SWITCH on
the msg_send is doing much for the benchmark either.  Normally when
doing repeated RPCs, with the client using msg_rpc and the server using
msg_receive/msg_send, things work as follows.  The server is blocked in
msg_receive.  The client executes msg_rpc.  Because SEND_SWITCH is used
internally, we switch to the server.  The server uses msg_send for the
reply.  SEND_SWITCH wouldn't do anything there, because the client
hasn't gotten to the receive part of its msg_rpc yet.  The server
executes msg_receive and blocks.  The scheduler picks the client, which
resumes the msg_rpc and goes to receive the reply.  It picks up the
reply message sitting there and loops around for another msg_rpc, etc.

With SEND_SWITCH on the msg_send, another mode of operation is
possible.  The client is blocked in the receive part of the msg_rpc,
and the server executes msg_send with SEND_SWITCH, so we switch to the
client.  The client loops around and does another msg_rpc.  In the send
part, the server isn't blocked in its receive yet, so the internal
SEND_SWITCH does nothing.  The client keeps going and blocks in the
receive part again.  The scheduler picks the server, which comes out of
the msg_send, executes the msg_receive, and executes the msg_send
again, etc.

I expect the time for an RPC is about the same in these two modes,
although I haven't checked that.  In any case, the benchmark is probably
vacillating between them, as scheduling quanta and other things can flip
the system from one mode to the other.

Rich

raveling@isi.edu (Paul Raveling) (10/17/89)

In article <2895@netcom.UUCP>, hue@netcom.UUCP (Jonathan Hue) writes:
> 
> >        Mean real time per context switch, corresponding context switch rate:
> >
> >        NeXT            MACH              873           1,145
> 
> The following program executes the loop 10000 times in 7.3 seconds on a
> NeXT.  Each pass through the loop involves 2 context switches, so this
> works out to 2,740 context switches/sec, or 365 usec per context switch.

	This suggests that I/O system overhead for the short
	(10-byte) piped messages that pt uses is about 500
	microseconds.

	BTW, someone else ran the pt benchmark on a NeXT machine
	and reported 920 microseconds per (context switch + msg
	transmission).  Timing may vary a bit among individual
	machines.


----------------
Paul Raveling
Raveling@isi.edu

phb@imag.imag.fr (Philippe Bernadat) (10/23/89)

In article <2895@netcom.UUCP> hue@netcom.UUCP (Jonathan Hue) writes:
>
>The following program executes the loop 10000 times in 7.3 seconds on a
>NeXT.  Each pass through the loop involves 2 context switches, so this
>works out to 2,740 context switches/sec, or 365 usec per context switch.
>

Running the same program on a NeXT and on a DS3100 I got the following numbers:


		elapsed time for	context switch		context switches
		10,000 loops (sec)	time (usec)		per second

NeXT			7.6			380			2,631
DS3100 (Mach 2.5)	2.3			115			8,695


-- 
+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+  Philippe Bernadat
EMAIL: bernadat@ri.osf.fr            OSF Research Institute
PHONE: (33) 76 42 82 41              2 Avenue Vignate
FAX:   (33) 76 54 03 99              38610 Gieres      FRANCE

raveling@isi.edu (Paul Raveling) (10/28/89)

In article <6393@imag.imag.fr>, phb@imag.imag.fr (Philippe Bernadat) writes:

> Running the same program on a NeXT and on a DS3100 I got the following numbers:
> 
> 
> 		elapsed time for	context switch		context switches
> 		10,000 loops (sec)	time (usec)		per second
> 
> NeXT			7.6			380			2,631
> DS3100 (Mach 2.5)	2.3			115			8,695

	Interesting...  Back when I first looked at Mach's native
	kernel interface, my guess was that Mach would be about
	a factor of 3 faster than a "plain vanilla" Unix.  Comparing
	with the pt benchmark (vanilla kernel functions, if not
	pure vanilla kernel) shows:

					     "vanilla"	Mach	Ratio
					      (usec)	(usec)

	DS3100,  ULTRIX 2.0 R7  & Mach 2.5	390	115	3.4
	DS3100,  ULTRIX 2.1 R14 & Mach 2.5	230	115	2.0
	NeXT,	 Mach:  Version not noted	873	380	2.3
	NeXT,	 Mach:  NeXT 1.0		920	380	2.4


	Another milestone is that the 3100 Mach benchmark is the first
	context switch time I've heard reported that's faster than either:

	    170 microseconds  --  PDP-11/45 running EPOS
	    140 microseconds  --  4 MHz Z-80 running a cut-down variant
					of an EPOS kernel


----------------
Paul Raveling
Raveling@isi.edu

Richard.Draves@CS.CMU.EDU (10/28/89)

What does this "pt" benchmark do?  Did I miss seeing the source for it?

The Mach context-switch benchmark that Jonathan Hue posted can be
improved, if one wishes to measure a fast context switch.  Both sides
can use msg_rpc, instead of one side using msg_rpc and the other
msg_receive/msg_send.  However, I wouldn't quote such a time as
measuring a typical client-server RPC.
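
The server side would go something like this, keeping the message setup
from the original program (a sketch I haven't run; the bookkeeping for
the final reply is left out):

	    /* Prime the loop with one msg_receive, then let msg_rpc send
	       each reply and receive the next request in a single kernel
	       call instead of two. */
	    msg.my_header.msg_local_port = port;
	    msg.my_header.msg_size = sizeof(msg);
	    if ((error = msg_receive(&(msg.my_header), MSG_OPTION_NONE, 0))
		!= RCV_SUCCESS)
		mach_error("msg_receive", error);
	    for (;;)
	    {
		/* After a receive, msg_remote_port names the client's
		   reply port; msg_local_port serves both as the reply
		   port carried in the outgoing message and as the port
		   on which the next request is received. */
		msg.my_header.msg_local_port = port;
		msg.my_header.msg_size = sizeof(msg);
		if ((error = msg_rpc(&(msg.my_header), MSG_OPTION_NONE,
				     sizeof(msg), 0, 0)) != RPC_SUCCESS)
		    mach_error("msg_rpc", error);
	    }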

Rich