Using Parallel LINQ in C#

The snippets below illustrate Parallel LINQ (PLINQ) in C# with a web crawler that fetches pages in parallel. The Crawler class crawls a web page (only one level deep) and returns the fetched documents. The Document class represents a downloaded HTML document.
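
As a quick primer, here is a minimal, self-contained sketch of the three PLINQ operators the crawler relies on; the items are just placeholder strings:

using System;
using System.Linq;

class PlinqPrimer
{
	static void Main ()
	{
		string[] items = { "a", "b", "c", "d" };

		// AsParallel turns the sequence into a parallel query,
		// WithDegreeOfParallelism caps the number of concurrent tasks,
		// and ForAll runs a side-effecting action on each element
		items.AsParallel ()
			.WithDegreeOfParallelism (2)
			.ForAll (item => Console.WriteLine ("Processing " + item));
	}
}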

Document class

using System;

namespace PLINQ_Crawler_Example
{
	public class Document
	{
		public Uri Url {
			get;
			set;
		}
		public string Html {
			get;
			set;
		}
	}
}

Crawler class

using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Threading;
using System.Xml;

namespace PLINQ_Crawler_Example
{
	public class Crawler
	{
		public int DegreeOfParallelism {
			get;
			set;
		}

		public Crawler (int degreeOfParallelism)
		{
			this.DegreeOfParallelism = degreeOfParallelism;
		}

		public IEnumerable<Document> Crawl (string crawlUrl, string filter, int waitTime)
		{
			// Documents are added from multiple PLINQ threads, so a thread-safe collection is needed
			ConcurrentBag<Document> documents = new ConcurrentBag<Document> ();
			Uri url = new Uri (crawlUrl);

			using (WebClient webClient = new WebClient()) {
				Console.WriteLine (string.Format ("Crawling {0} with filter {1}", crawlUrl, filter));

				string htmlContent = webClient.DownloadString (url);

				documents.Add (new Document (){
					Html = htmlContent,
					Url = url
				});

				// Note: LoadXml expects well-formed XHTML and will throw on pages that are not valid XML
				XmlDocument xmlDocument = new XmlDocument ();
				xmlDocument.LoadXml (htmlContent);

				var linkNodes = xmlDocument.SelectNodes (string.Format ("//a[contains(@href, '{0}')]", filter));

				Console.WriteLine (string.Format ("Found {0} links", linkNodes.Count));

				string baseUrl = string.Format ("{0}://{1}", url.Scheme, url.Authority);

				List<string> links = new List<string> ();

				for (int i = 0; i < linkNodes.Count; i++) {
					var linkNode = linkNodes.Item (i);
					links.Add (string.Format ("{0}{1}", baseUrl, linkNode.Attributes ["href"].Value));
				}

				// Fetch all links using PLINQ; ForAll runs the lambda concurrently
				// on up to DegreeOfParallelism threads
				links.AsParallel ().WithDegreeOfParallelism (DegreeOfParallelism).ForAll (link => {
					Uri linkUrl = new Uri (link);

					using (WebClient linkWebClient = new WebClient()) {
						try {
							string linkHtml = linkWebClient.DownloadString (linkUrl);

							documents.Add (new Document (){
								Html = linkHtml,
								Url = linkUrl
							});

							Console.WriteLine (string.Format ("Fetched {0}", link));

						} catch {
							// Ignore links that fail to download
						}
					}

					Thread.Sleep (waitTime);
				});

			}

			return documents;
		}
	}
}
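
A note on the design: ForAll exists purely for its side effects, which is why the shared documents collection has to be thread-safe. When only the results matter, a projection with Select lets PLINQ gather the documents itself, with no shared mutable state. A minimal sketch of that variant, where TryFetch is a hypothetical helper and links and DegreeOfParallelism are as in the Crawler class above:

// Hypothetical helper: download one link, or return null on failure
private Document TryFetch (string link)
{
	Uri linkUrl = new Uri (link);

	using (WebClient webClient = new WebClient ()) {
		try {
			return new Document () {
				Html = webClient.DownloadString (linkUrl),
				Url = linkUrl
			};
		} catch {
			// Skip links that fail to download
			return null;
		}
	}
}

// Let PLINQ collect the results instead of mutating a shared collection
List<Document> fetched = links
	.AsParallel ()
	.WithDegreeOfParallelism (DegreeOfParallelism)
	.Select (link => TryFetch (link))
	.Where (document => document != null)
	.ToList ();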

The main program

Three requests are sent concurrently, and a wait time of 3000 ms is introduced after each request to put less pressure on the Wikipedia servers.

using System;
using System.Collections.Generic;
using System.Linq;

namespace PLINQ_Crawler_Example
{
	class MainClass
	{
		public static void Main (string[] args)
		{
			// Settings
			string crawlUrl = "http://en.wikipedia.org/wiki/Example";
			string filter = "/wiki/";
			int degreeOfParallelism = 3;
			int waitTime = 3000;

			// Start crawling
			Crawler crawler = new Crawler(degreeOfParallelism);
			IEnumerable<Document> documents = crawler.Crawl(crawlUrl, filter, waitTime);

			int totalBytes = documents.Sum(document => document.Html.Length);

			Console.WriteLine(string.Format("Fetched {0} documents with a total size of {1} bytes",
			                  documents.Count(), totalBytes));

			Console.ReadLine();
		}
	}
}

Output

Crawling http://en.wikipedia.org/wiki/Example with filter /wiki/
Found 51 links
Fetched [...]
Fetched http://en.wikipedia.org/wiki/Help:Contents
Fetched http://en.wikipedia.org/wiki/Wikipedia:About
Fetched 32 documents with a total size of 2217436 bytes

The solution file can be downloaded from here.
