我正试图从网络词典中获取某些单词的发音。例如,在下面的代码中,我想从 http://collinsdictionary.com 获取单词 good 的发音(此处使用 HTML Agility Pack):
/// <summary>
/// Downloads the Collins dictionary page for "good", parses it with HtmlAgilityPack and
/// writes the first text node matched by the XPath below to the console, or
/// "XPath not found." when the XPath matches nothing.
/// </summary>
static void test()
{
    String url = "http://www.collinsdictionary.com/dictionary/english/good";
    String html;

    // WebClient is IDisposable; release it deterministically once the page is downloaded.
    using (WebClient client = new WebClient())
    {
        // Decode the response body as UTF-8.
        client.Encoding = System.Text.Encoding.UTF8;
        html = client.DownloadString(url);
    }

    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(html);

    HtmlAgilityPack.HtmlNode node = doc.DocumentNode.SelectSingleNode("//*[@id=\"good_1\"]/div[1]/h2/span/text()[1]");
    if (node == null)
    {
        Console.WriteLine("XPath not found.");
    }
    else
    {
        Console.WriteLine(node.WriteTo());
    }
}
我期望得到的输出是:
(ɡʊd
但我实际得到的却是:
(É¡?d
怎样才能得到正确的输出?
问题不在于文本的解析,而在于控制台的输出。如果是在命令行应用程序中运行这段代码,可以将控制台的输出编码设置为 Unicode:
Console.OutputEncoding = System.Text.Encoding.Unicode;
您还需要确保控制台使用的字体支持 Unicode。有关详细信息,请参阅此答案。
如果您知道页面的编码(例如 System.Text.Encoding.UTF8),可以显式指定:
string html = DownloadSmallFiles_String(url, System.Text.Encoding.UTF8, 20000);
或使用自动编码检测(取决于服务器响应)
string html = DownloadSmallFiles_String(url, null, 20000);
最后加载html
doc.LoadHtml(html);
试试下面的代码
/// <summary>
/// Fetches the Collins dictionary page for "good" via DownloadSmallFiles_String, parses it
/// with HtmlAgilityPack and writes the text node matched by the XPath to the console, or
/// "XPath not found." when nothing matches.
/// </summary>
static void test()
{
    const string pageUrl = "http://www.collinsdictionary.com/dictionary/english/good";
    const string pronunciationXPath = "//*[@id=\"good_1\"]/div[1]/h2/span/text()[1]";

    // null asks the downloader to choose the encoding itself;
    // assign e.g. System.Text.Encoding.UTF8 here to force a specific one.
    System.Text.Encoding encodingOverride = null;
    string markup = DownloadSmallFiles_String(pageUrl, encodingOverride, 20000);

    var document = new HtmlAgilityPack.HtmlDocument();
    document.LoadHtml(markup);

    HtmlAgilityPack.HtmlNode match = document.DocumentNode.SelectSingleNode(pronunciationXPath);
    if (match != null)
    {
        Console.WriteLine(match.WriteTo());
        return;
    }

    Console.WriteLine("XPath not found.");
}
/// <summary>
/// Builds a pre-configured GET <see cref="HttpWebRequest"/> for the given address.
/// </summary>
/// <param name="url">Address to request; passed straight to <c>WebRequest.Create</c>.</param>
/// <param name="TimeOut">Value assigned to the request's <c>Timeout</c> property (default 20000).</param>
/// <returns>The configured request; nothing has been sent yet.</returns>
private static HttpWebRequest CreateWebRequest(string url, int TimeOut = 20000)
{
    var webRequest = (HttpWebRequest)WebRequest.Create(url);

    // Basic request shape.
    webRequest.Method = "GET";
    webRequest.Timeout = TimeOut;
    webRequest.KeepAlive = false;

    // Identify as a desktop Internet Explorer 11 browser.
    webRequest.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko";

    // Bypass caches entirely: neither read from nor store into them.
    webRequest.CachePolicy = new HttpRequestCachePolicy(HttpRequestCacheLevel.NoCacheNoStore);

    webRequest.UseDefaultCredentials = true;

    // No proxy is configured (the original note mentioned ProxyHelperClass.GetIEProxy as an alternative).
    webRequest.Proxy = null;

    return webRequest;
}
/// <summary>
/// Downloads the resource at <paramref name="Url"/> and returns its body decoded as text.
/// This is a best-effort helper: any failure (bad URL, network error, non-200 status,
/// decoding problem) yields an empty string instead of an exception.
/// </summary>
/// <param name="Url">Address to download.</param>
/// <param name="ForceTextEncoding_SetThistoNothingToUseAutomatic">
/// Encoding used to decode the body. Pass <c>null</c> to use the charset reported by the
/// response, falling back to StreamReader's default when the response reports none or
/// reports one that .NET does not recognise.
/// </param>
/// <param name="TimeOut">Request timeout passed to CreateWebRequest (default 20000).</param>
/// <returns>The decoded body, or "" when the status is not 200 OK or anything fails.</returns>
/// <remarks>
/// NOTE(review): some runtimes reportedly make HttpWebResponse.CharacterSet default to
/// "ISO-8859-1" when the Content-Type header carries no charset - confirm; if pages come back
/// garbled, pass an explicit encoding such as System.Text.Encoding.UTF8.
/// </remarks>
public static string DownloadSmallFiles_String(string Url, System.Text.Encoding ForceTextEncoding_SetThistoNothingToUseAutomatic, int TimeOut = 20000)
{
    try
    {
        HttpWebRequest request = CreateWebRequest(Url, TimeOut);
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            if (response.StatusCode != HttpStatusCode.OK)
            {
                return "";
            }

            // A caller-supplied encoding always wins; otherwise try the charset the server reported.
            System.Text.Encoding encoding = ForceTextEncoding_SetThistoNothingToUseAutomatic;
            if (encoding == null && !string.IsNullOrEmpty(response.CharacterSet))
            {
                try
                {
                    // Servers sometimes quote the charset value (charset="utf-8"); strip that first.
                    string charset = response.CharacterSet.Trim().Trim('"', '\'');
                    encoding = System.Text.Encoding.GetEncoding(charset);
                }
                catch (ArgumentException)
                {
                    // Unknown or malformed charset name: do not throw the whole page away,
                    // let StreamReader fall back to its default decoding instead.
                    encoding = null;
                }
            }

            using (Stream receiveStream = response.GetResponseStream())
            using (StreamReader reader = encoding != null
                ? new StreamReader(receiveStream, encoding)
                : new StreamReader(receiveStream))
            {
                return reader.ReadToEnd();
            }
        }
    }
    catch (Exception ex)
    {
        // Deliberate best-effort: callers rely on "" for any failure, but leave a trace
        // so the cause is not lost entirely.
        System.Diagnostics.Trace.TraceWarning("DownloadSmallFiles_String failed for '{0}': {1}", Url, ex.Message);
        return "";
    }
}