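To screen-scrape web pages generated by ASP.NET and delivered via HTTPS, you can still use HttpWebRequest, but there are several "gotchas" to keep in mind (cookies, hidden form fields such as view state, and the partial-postback request headers); the sample below shows how to handle them: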
/// <summary>
/// Sample entry point: fetches the first page with HTTP Basic authentication,
/// hands it to <c>ParseHtml</c>, then keeps posting the "next page" partial
/// postback until the response contains the disabled "Next" link.
/// </summary>
/// <remarks>
/// The hidden form fields (view state etc.) taken from the first page are
/// re-sent on every postback and refreshed from each partial response.
/// </remarks>
private static void Main()
{
    HttpWebRequest request = CreateRequest("https://url-here");
    request.Headers.Add("Authorization", "Basic " + Convert.ToBase64String(ASCIIEncoding.ASCII.GetBytes("username:password")));
    string respHtml = ReadResponseBody(request);
    ParseHtml(respHtml); // TODO: implement your parser function here
    NameValueCollection form = ExtractHiddenFields(respHtml);

    // For paginated responses, get next page (TODO: replace my "next page available" logic with your own)
    // Ordinal comparison: this is a literal markup fragment, not linguistic text.
    while (respHtml.IndexOf("<a id=\"ctl00_ContentPlaceHolder2_lbNext\" disabled=\"disabled\">Next</a></li>", StringComparison.Ordinal) < 0)
    {
        request = CreateRequest("https://url-here");
        request.Method = "POST";
        // Asks the server for a partial-rendering (delta) response instead of a full page.
        request.Headers.Add("X-MicrosoftAjax", "Delta=true");
        request.ContentType = "application/x-www-form-urlencoded";
        using (StreamWriter w = new StreamWriter(request.GetRequestStream()))
        {
            form["__EVENTTARGET"] = "ctl00$ContentPlaceHolder2$lbNext"; // TODO: replace my event target with yours
            form["ctl00$ScriptManager1"] = "ctl00$ContentPlaceHolder2$UpdatePanel1|ctl00$ContentPlaceHolder2$lbNext"; // TODO: same here
            w.Write(GetPayloadString(form));
            w.Flush();
        }

        respHtml = ReadResponseBody(request);
        ParseHtml(respHtml);
        // Carry the refreshed hidden fields over to the next postback.
        ExtractHiddenFieldsFromAjax(respHtml, form);
    }
}

/// <summary>
/// Executes <paramref name="request"/> and returns the whole response body as text.
/// </summary>
/// <param name="request">A fully configured request whose body (if any) has already been written.</param>
/// <returns>The response body read to the end.</returns>
private static string ReadResponseBody(HttpWebRequest request)
{
    // Dispose the response and reader so the underlying connection is released;
    // otherwise repeated requests in the paging loop can exhaust the connection pool.
    using (HttpWebResponse resp = (HttpWebResponse)request.GetResponse())
    using (StreamReader reader = new StreamReader(resp.GetResponseStream()))
    {
        return reader.ReadToEnd();
    }
}
/// <summary>
/// Collects the hidden <c>&lt;input&gt;</c> fields of a full HTML page.
/// </summary>
/// <param name="html">The page markup to scan.</param>
/// <returns>
/// A collection mapping each hidden field's <c>name</c> attribute to its
/// <c>value</c> attribute; empty when no field matches.
/// </returns>
/// <remarks>
/// Only tags written exactly as
/// <c>&lt;input type="hidden" name="…" id="…" value="…" /&gt;</c>
/// (this attribute order, single spaces) are recognized. Values are taken
/// verbatim from the markup; no HTML decoding is applied.
/// </remarks>
public static NameValueCollection ExtractHiddenFields(string html)
{
    NameValueCollection form = new NameValueCollection();

    // Group 1 = name attribute, group 2 = value attribute; the id attribute is matched but not captured.
    Regex hiddenPattern = new Regex("<input type=\"hidden\" name=\"([^\"]*)\" id=\"[^\"]*\" value=\"([^\"]*)\" />");

    foreach (Match match in hiddenPattern.Matches(html))
    {
        form.Add(match.Groups[1].Value, match.Groups[2].Value);
    }

    return form;
}
/// <summary>
/// Reads the <c>hiddenField</c> entries out of a partial-postback response and
/// stores them in <paramref name="form"/>, overwriting entries with the same name.
/// </summary>
/// <param name="html">
/// The response body, parsed as a sequence of <c>length|type|id|content|</c>
/// segments where <c>length</c> is the character count of <c>content</c>.
/// <c>null</c> or empty input leaves the collection untouched.
/// </param>
/// <param name="form">The collection to update; a new one is created when <c>null</c>.</param>
/// <returns>The updated collection (the same instance as <paramref name="form"/> when it was not <c>null</c>).</returns>
/// <exception cref="FormatException">
/// A segment is truncated, or its length prefix is not a valid non-negative
/// integer that fits in the remaining text (for example when the server
/// answered with a full HTML page instead of a delta response).
/// </exception>
public static NameValueCollection ExtractHiddenFieldsFromAjax(string html, NameValueCollection form)
{
    form = form ?? new NameValueCollection();
    if (string.IsNullOrEmpty(html))
    {
        return form;
    }

    // Walk the text with an index instead of repeatedly splitting and
    // copying the remainder, which is quadratic on large responses.
    int pos = 0;
    while (pos < html.Length)
    {
        int lengthEnd = html.IndexOf('|', pos);
        int typeEnd = lengthEnd < 0 ? -1 : html.IndexOf('|', lengthEnd + 1);
        int idEnd = typeEnd < 0 ? -1 : html.IndexOf('|', typeEnd + 1);
        if (idEnd < 0)
        {
            throw new FormatException("Truncated segment header at offset " + pos + ".");
        }

        // The length prefix is machine-generated, so parse it culture-independently.
        int valueLength = int.Parse(html.Substring(pos, lengthEnd - pos), System.Globalization.CultureInfo.InvariantCulture);
        int valueStart = idEnd + 1;
        if (valueLength < 0 || valueLength > html.Length - valueStart)
        {
            throw new FormatException("Segment at offset " + pos + " declares a content length of " + valueLength + " that does not fit the response.");
        }

        // The content is taken by length, not by delimiter, because it may itself contain '|'.
        string type = html.Substring(lengthEnd + 1, typeEnd - lengthEnd - 1);
        if (type == "hiddenField")
        {
            string id = html.Substring(typeEnd + 1, idEnd - typeEnd - 1);
            form[id] = html.Substring(valueStart, valueLength);
        }

        // Skip the content plus its trailing delimiter; the delimiter may be
        // absent after the very last segment, in which case the loop simply ends.
        pos = valueStart + valueLength + 1;
    }

    return form;
}
/// <summary>
/// Builds an <see cref="HttpWebRequest"/> for <paramref name="uri"/> that sends
/// a Firefox user-agent string and uses the shared cookie container.
/// </summary>
/// <param name="uri">The address to request.</param>
/// <returns>The configured, not yet sent, request.</returns>
public static HttpWebRequest CreateRequest(string uri)
{
    const string BrowserUserAgent = "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.13) Gecko/20100914 Firefox/3.5.13 (.NET CLR 3.5.30729)";

    WebRequest created = WebRequest.Create(uri);
    HttpWebRequest httpRequest = (HttpWebRequest)created;
    httpRequest.UserAgent = BrowserUserAgent;
    // Reuse the same cookie container across requests.
    httpRequest.CookieContainer = s_cc;
    return httpRequest;
}
/// <summary>
/// Serializes <paramref name="form"/> as a URL-encoded
/// <c>key=value&amp;key=value</c> string.
/// </summary>
/// <param name="form">The fields to serialize; may be <c>null</c>.</param>
/// <returns>
/// One <c>key=value</c> pair per stored value (a key with several values is
/// repeated), joined with <c>&amp;</c>; an empty string when
/// <paramref name="form"/> is <c>null</c> or has no values.
/// </returns>
public static string GetPayloadString(NameValueCollection form)
{
    if (form == null)
    {
        return String.Empty;
    }

    StringBuilder payload = new StringBuilder();
    for (int i = 0; i < form.Count; i++)
    {
        string[] entryValues = form.GetValues(i);
        if (entryValues == null)
        {
            // Key present without any value: nothing to emit.
            continue;
        }

        string encodedKey = HttpUtility.UrlEncode(form.GetKey(i));
        foreach (string entryValue in entryValues)
        {
            if (payload.Length > 0)
            {
                payload.Append("&");
            }

            payload.Append(encodedKey).Append("=").Append(HttpUtility.UrlEncode(entryValue));
        }
    }

    return payload.ToString();
}